text_clean 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/Rakefile +11 -0
- data/ext/text_clean/text_clean.cc +3 -3
- data/lib/text_clean/version.rb +1 -1
- data/spec/text_clean_spec.rb +61 -0
- data/text_clean.gemspec +1 -0
- metadata +19 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
|
4
|
+
data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
|
7
|
+
data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
|
data/.gitignore
ADDED
data/Rakefile
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
|
2
|
+
require 'rspec'
|
3
|
+
require 'rspec/core/rake_task'
|
4
|
+
|
5
|
+
# `rake test`
|
6
|
+
desc 'Run all examples'
|
7
|
+
RSpec::Core::RakeTask.new(:test) do |t|
|
8
|
+
t.pattern = 'spec/**/*_spec.rb'
|
9
|
+
end
|
10
|
+
|
11
|
+
# `rake compile`
|
1
12
|
require "rake/extensiontask"
|
2
13
|
|
3
14
|
Rake::ExtensionTask.new "text_clean" do |ext|
|
data/ext/text_clean/text_clean.cc
CHANGED
@@ -52,7 +52,7 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
52
52
|
just_added_period = false;
|
53
53
|
}
|
54
54
|
} else {
|
55
|
-
// scan ahead to see if this is
|
55
|
+
// scan ahead to see if this hyphen is at the end of the line
|
56
56
|
char* scan_ahead;
|
57
57
|
for (scan_ahead = read + 1; scan_ahead < eos; scan_ahead++) {
|
58
58
|
char s = *scan_ahead;
|
@@ -74,11 +74,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
74
74
|
*write++ = line_sep;
|
75
75
|
just_added_period = true;
|
76
76
|
just_added_space = false;
|
77
|
-
} else if (c == ' ' && !just_added_space && !just_added_period) {
|
77
|
+
} else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
|
78
78
|
*write++ = ' ';
|
79
79
|
just_added_space = true;
|
80
80
|
just_added_period = false;
|
81
|
-
} else if (c >= 'a' && c <= 'z') {
|
81
|
+
} else if (c == '\'' || (c >= 'a' && c <= 'z')) {
|
82
82
|
*write++ = c;
|
83
83
|
just_added_space = false;
|
84
84
|
just_added_period = false;
|
data/lib/text_clean/version.rb
CHANGED
data/spec/text_clean_spec.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'text_clean'
|
3
|
+
|
4
|
+
def clean(text, line_sep="\n", &block)
|
5
|
+
TextClean.clean(text, line_sep)
|
6
|
+
block.call(text)
|
7
|
+
end
|
8
|
+
|
9
|
+
def clean_eq(text, expected_after_cleaning, line_sep="\n")
|
10
|
+
clean(text, line_sep) { |t| expect(t).to eq(expected_after_cleaning) }
|
11
|
+
end
|
12
|
+
|
13
|
+
describe TextClean do
|
14
|
+
it "lowercases text" do
|
15
|
+
clean_eq("ALL UPPERCASE", "all uppercase")
|
16
|
+
end
|
17
|
+
|
18
|
+
it "compacts whitespace" do
|
19
|
+
clean_eq("is\t it so?", "is it so\n")
|
20
|
+
end
|
21
|
+
|
22
|
+
it "treats [,/&] as whitespace" do
|
23
|
+
clean_eq("a,bb&cc/d", "a bb cc d")
|
24
|
+
end
|
25
|
+
|
26
|
+
it "treats [;:!?] as sentence separators" do
|
27
|
+
clean_eq("x;y?z!:q", "x.y.z.q", ".")
|
28
|
+
end
|
29
|
+
|
30
|
+
it "joins hyphenated words at line end" do
|
31
|
+
clean_eq("satis-\nfaction", "satisfaction")
|
32
|
+
end
|
33
|
+
|
34
|
+
it "ignores whitespace after hyphen at line end" do
|
35
|
+
clean_eq("satis- \t \nfaction", "satisfaction")
|
36
|
+
end
|
37
|
+
|
38
|
+
it "treats a double hyphen as word separator" do
|
39
|
+
clean_eq("good--do it", "good do it")
|
40
|
+
end
|
41
|
+
|
42
|
+
it "compacts whitespace around double hyphen" do
|
43
|
+
clean_eq("good -- do it", "good do it")
|
44
|
+
end
|
45
|
+
|
46
|
+
it "treats newlines as word separator" do
|
47
|
+
clean_eq("a\nb\nc", "a b c")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "ignores numbers" do
|
51
|
+
clean_eq("123abc", "abc")
|
52
|
+
end
|
53
|
+
|
54
|
+
it "ignores quotes and braces" do
|
55
|
+
clean_eq("[jim] \"speaks\"", "jim speaks")
|
56
|
+
end
|
57
|
+
|
58
|
+
it "keeps apostrophes" do
|
59
|
+
clean_eq("dad's", "dad's")
|
60
|
+
end
|
61
|
+
end
|
data/text_clean.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_clean
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duane Johnson
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.0'
|
69
83
|
description: Cleans text by removing punctuation, lowercasing. Very fast.
|
70
84
|
email:
|
71
85
|
- duane.johnson@gmail.com
|
@@ -76,6 +90,7 @@ extensions:
|
|
76
90
|
- ext/text_clean/extconf.rb
|
77
91
|
extra_rdoc_files: []
|
78
92
|
files:
|
93
|
+
- ".gitignore"
|
79
94
|
- Gemfile
|
80
95
|
- Rakefile
|
81
96
|
- bin/text_clean
|
@@ -84,6 +99,7 @@ files:
|
|
84
99
|
- ext/text_clean/text_clean.cc
|
85
100
|
- lib/text_clean.rb
|
86
101
|
- lib/text_clean/version.rb
|
102
|
+
- spec/text_clean_spec.rb
|
87
103
|
- text_clean.gemspec
|
88
104
|
homepage: https://github.com/wordtreefoundation
|
89
105
|
licenses:
|
@@ -110,4 +126,5 @@ rubygems_version: 2.2.2
|
|
110
126
|
signing_key:
|
111
127
|
specification_version: 4
|
112
128
|
summary: Text cleaner
|
113
|
-
test_files:
|
129
|
+
test_files:
|
130
|
+
- spec/text_clean_spec.rb
|