text_clean 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/Rakefile +11 -0
- data/ext/text_clean/text_clean.cc +3 -3
- data/lib/text_clean/version.rb +1 -1
- data/spec/text_clean_spec.rb +61 -0
- data/text_clean.gemspec +1 -0
- metadata +19 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
|
4
|
+
data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
|
7
|
+
data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
|
data/.gitignore
ADDED
data/Rakefile
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
|
2
|
+
require 'rspec'
|
3
|
+
require 'rspec/core/rake_task'
|
4
|
+
|
5
|
+
# `rake test`
|
6
|
+
desc 'Run all examples'
|
7
|
+
RSpec::Core::RakeTask.new(:test) do |t|
|
8
|
+
t.pattern = 'spec/**/*_spec.rb'
|
9
|
+
end
|
10
|
+
|
11
|
+
# `rake compile`
|
1
12
|
require "rake/extensiontask"
|
2
13
|
|
3
14
|
Rake::ExtensionTask.new "text_clean" do |ext|
|
data/ext/text_clean/text_clean.cc
CHANGED
@@ -52,7 +52,7 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
52
52
|
just_added_period = false;
|
53
53
|
}
|
54
54
|
} else {
|
55
|
-
// scan ahead to see if this is
|
55
|
+
// scan ahead to see if this hyphen is at the end of the line
|
56
56
|
char* scan_ahead;
|
57
57
|
for (scan_ahead = read + 1; scan_ahead < eos; scan_ahead++) {
|
58
58
|
char s = *scan_ahead;
|
@@ -74,11 +74,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
74
74
|
*write++ = line_sep;
|
75
75
|
just_added_period = true;
|
76
76
|
just_added_space = false;
|
77
|
-
} else if (c == ' ' && !just_added_space && !just_added_period) {
|
77
|
+
} else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
|
78
78
|
*write++ = ' ';
|
79
79
|
just_added_space = true;
|
80
80
|
just_added_period = false;
|
81
|
-
} else if (c >= 'a' && c <= 'z') {
|
81
|
+
} else if (c == '\'' || (c >= 'a' && c <= 'z')) {
|
82
82
|
*write++ = c;
|
83
83
|
just_added_space = false;
|
84
84
|
just_added_period = false;
|
data/lib/text_clean/version.rb
CHANGED
data/spec/text_clean_spec.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'text_clean'
|
3
|
+
|
4
|
+
def clean(text, line_sep="\n", &block)
|
5
|
+
TextClean.clean(text, line_sep)
|
6
|
+
block.call(text)
|
7
|
+
end
|
8
|
+
|
9
|
+
def clean_eq(text, expected_after_cleaning, line_sep="\n")
|
10
|
+
clean(text, line_sep) { |t| expect(t).to eq(expected_after_cleaning) }
|
11
|
+
end
|
12
|
+
|
13
|
+
describe TextClean do
|
14
|
+
it "lowercases text" do
|
15
|
+
clean_eq("ALL UPPERCASE", "all uppercase")
|
16
|
+
end
|
17
|
+
|
18
|
+
it "compacts whitespace" do
|
19
|
+
clean_eq("is\t it so?", "is it so\n")
|
20
|
+
end
|
21
|
+
|
22
|
+
it "treats [,/&] as whitespace" do
|
23
|
+
clean_eq("a,bb&cc/d", "a bb cc d")
|
24
|
+
end
|
25
|
+
|
26
|
+
it "treats [;:!?] as sentence separators" do
|
27
|
+
clean_eq("x;y?z!:q", "x.y.z.q", ".")
|
28
|
+
end
|
29
|
+
|
30
|
+
it "joins hyphenated words at line end" do
|
31
|
+
clean_eq("satis-\nfaction", "satisfaction")
|
32
|
+
end
|
33
|
+
|
34
|
+
it "ignores whitespace after hyphen at line end" do
|
35
|
+
clean_eq("satis- \t \nfaction", "satisfaction")
|
36
|
+
end
|
37
|
+
|
38
|
+
it "treats a double hyphen as word separator" do
|
39
|
+
clean_eq("good--do it", "good do it")
|
40
|
+
end
|
41
|
+
|
42
|
+
it "compacts whitespace around double hyphen" do
|
43
|
+
clean_eq("good -- do it", "good do it")
|
44
|
+
end
|
45
|
+
|
46
|
+
it "treats newlines as word separator" do
|
47
|
+
clean_eq("a\nb\nc", "a b c")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "ignores numbers" do
|
51
|
+
clean_eq("123abc", "abc")
|
52
|
+
end
|
53
|
+
|
54
|
+
it "ignores quotes and braces" do
|
55
|
+
clean_eq("[jim] \"speaks\"", "jim speaks")
|
56
|
+
end
|
57
|
+
|
58
|
+
it "keeps apostrophes" do
|
59
|
+
clean_eq("dad's", "dad's")
|
60
|
+
end
|
61
|
+
end
|
data/text_clean.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_clean
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duane Johnson
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.0'
|
69
83
|
description: Cleans text by removing punctuation, lowercasing. Very fast.
|
70
84
|
email:
|
71
85
|
- duane.johnson@gmail.com
|
@@ -76,6 +90,7 @@ extensions:
|
|
76
90
|
- ext/text_clean/extconf.rb
|
77
91
|
extra_rdoc_files: []
|
78
92
|
files:
|
93
|
+
- ".gitignore"
|
79
94
|
- Gemfile
|
80
95
|
- Rakefile
|
81
96
|
- bin/text_clean
|
@@ -84,6 +99,7 @@ files:
|
|
84
99
|
- ext/text_clean/text_clean.cc
|
85
100
|
- lib/text_clean.rb
|
86
101
|
- lib/text_clean/version.rb
|
102
|
+
- spec/text_clean_spec.rb
|
87
103
|
- text_clean.gemspec
|
88
104
|
homepage: https://github.com/wordtreefoundation
|
89
105
|
licenses:
|
@@ -110,4 +126,5 @@ rubygems_version: 2.2.2
|
|
110
126
|
signing_key:
|
111
127
|
specification_version: 4
|
112
128
|
summary: Text cleaner
|
113
|
-
test_files: []
|
129
|
+
test_files:
|
130
|
+
- spec/text_clean_spec.rb
|