text_clean 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 73936fded6a20b6fdde25a06bd5cb74b830177c1
4
- data.tar.gz: 7fc77b21e64fad053e1e56f1245ddf70d7b59195
3
+ metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
4
+ data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
5
5
  SHA512:
6
- metadata.gz: 596c250db5028aaaaa71c7290e160706e08365b63ba6d9b9aed79ecc03ccea965e6380dfe7709e578fd4ac6f32a733c29d6d07bfdee60a3a41d863bd94bf936f
7
- data.tar.gz: 399449bc2bb5c8fafed2a435446488592ad62eb05eda659d1dd4f8338c07280091bcf604f482725b46367b3daa7e1baac73b16530c02d5950f26cfb9f03ca3c3
6
+ metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
7
+ data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ Gemfile.lock
2
+ lib/text_clean/text_clean.bundle
3
+ lib/text_clean/text_clean.so
4
+ *.gem
5
+ tmp/
data/Rakefile CHANGED
@@ -1,3 +1,14 @@
1
+
2
+ require 'rspec'
3
+ require 'rspec/core/rake_task'
4
+
5
+ # `rake test`
6
+ desc 'Run all examples'
7
+ RSpec::Core::RakeTask.new(:test) do |t|
8
+ t.pattern = 'spec/**/*_spec.rb'
9
+ end
10
+
11
+ # `rake compile`
1
12
  require "rake/extensiontask"
2
13
 
3
14
  Rake::ExtensionTask.new "text_clean" do |ext|
data/ext/text_clean/text_clean.cc CHANGED
@@ -52,7 +52,7 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
52
52
  just_added_period = false;
53
53
  }
54
54
  } else {
55
- // scan ahead to see if this is a hyphen at the end of the line
55
+ // scan ahead to see if this hyphen is at the end of the line
56
56
  char* scan_ahead;
57
57
  for (scan_ahead = read + 1; scan_ahead < eos; scan_ahead++) {
58
58
  char s = *scan_ahead;
@@ -74,11 +74,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
74
74
  *write++ = line_sep;
75
75
  just_added_period = true;
76
76
  just_added_space = false;
77
- } else if (c == ' ' && !just_added_space && !just_added_period) {
77
+ } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
78
78
  *write++ = ' ';
79
79
  just_added_space = true;
80
80
  just_added_period = false;
81
- } else if (c >= 'a' && c <= 'z') {
81
+ } else if (c == '\'' || (c >= 'a' && c <= 'z')) {
82
82
  *write++ = c;
83
83
  just_added_space = false;
84
84
  just_added_period = false;
data/lib/text_clean/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TextClean
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
3
3
  end
data/spec/text_clean_spec.rb ADDED
@@ -0,0 +1,61 @@
1
+ require 'rspec'
2
+ require 'text_clean'
3
+
4
+ def clean(text, line_sep="\n", &block)
5
+ TextClean.clean(text, line_sep)
6
+ block.call(text)
7
+ end
8
+
9
+ def clean_eq(text, expected_after_cleaning, line_sep="\n")
10
+ clean(text, line_sep) { |t| expect(t).to eq(expected_after_cleaning) }
11
+ end
12
+
13
+ describe TextClean do
14
+ it "lowercases text" do
15
+ clean_eq("ALL UPPERCASE", "all uppercase")
16
+ end
17
+
18
+ it "compacts whitespace" do
19
+ clean_eq("is\t it so?", "is it so\n")
20
+ end
21
+
22
+ it "treats [,/&] as whitespace" do
23
+ clean_eq("a,bb&cc/d", "a bb cc d")
24
+ end
25
+
26
+ it "treats [;:!?] as sentence separators" do
27
+ clean_eq("x;y?z!:q", "x.y.z.q", ".")
28
+ end
29
+
30
+ it "joins hyphenated words at line end" do
31
+ clean_eq("satis-\nfaction", "satisfaction")
32
+ end
33
+
34
+ it "ignores whitespace after hyphen at line end" do
35
+ clean_eq("satis- \t \nfaction", "satisfaction")
36
+ end
37
+
38
+ it "treats a double hyphen as word separator" do
39
+ clean_eq("good--do it", "good do it")
40
+ end
41
+
42
+ it "compacts whitespace around double hyphen" do
43
+ clean_eq("good -- do it", "good do it")
44
+ end
45
+
46
+ it "treats newlines as word separator" do
47
+ clean_eq("a\nb\nc", "a b c")
48
+ end
49
+
50
+ it "ignores numbers" do
51
+ clean_eq("123abc", "abc")
52
+ end
53
+
54
+ it "ignores quotes and braces" do
55
+ clean_eq("[jim] \"speaks\"", "jim speaks")
56
+ end
57
+
58
+ it "keeps apostrophes" do
59
+ clean_eq("dad's", "dad's")
60
+ end
61
+ end
data/text_clean.gemspec CHANGED
@@ -24,4 +24,5 @@ Gem::Specification.new do |spec|
24
24
  spec.add_development_dependency "rake", "~> 10.3"
25
25
  spec.add_development_dependency "rake-compiler", "~> 0.9"
26
26
  spec.add_development_dependency "byebug", "~> 3.4"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
27
28
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_clean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duane Johnson
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '3.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
69
83
  description: Cleans text by removing punctuation, lowercasing. Very fast.
70
84
  email:
71
85
  - duane.johnson@gmail.com
@@ -76,6 +90,7 @@ extensions:
76
90
  - ext/text_clean/extconf.rb
77
91
  extra_rdoc_files: []
78
92
  files:
93
+ - ".gitignore"
79
94
  - Gemfile
80
95
  - Rakefile
81
96
  - bin/text_clean
@@ -84,6 +99,7 @@ files:
84
99
  - ext/text_clean/text_clean.cc
85
100
  - lib/text_clean.rb
86
101
  - lib/text_clean/version.rb
102
+ - spec/text_clean_spec.rb
87
103
  - text_clean.gemspec
88
104
  homepage: https://github.com/wordtreefoundation
89
105
  licenses:
@@ -110,4 +126,5 @@ rubygems_version: 2.2.2
110
126
  signing_key:
111
127
  specification_version: 4
112
128
  summary: Text cleaner
113
- test_files: []
129
+ test_files:
130
+ - spec/text_clean_spec.rb