text_clean 0.1.1 → 0.2.0

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 73936fded6a20b6fdde25a06bd5cb74b830177c1
4
- data.tar.gz: 7fc77b21e64fad053e1e56f1245ddf70d7b59195
3
+ metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
4
+ data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
5
5
  SHA512:
6
- metadata.gz: 596c250db5028aaaaa71c7290e160706e08365b63ba6d9b9aed79ecc03ccea965e6380dfe7709e578fd4ac6f32a733c29d6d07bfdee60a3a41d863bd94bf936f
7
- data.tar.gz: 399449bc2bb5c8fafed2a435446488592ad62eb05eda659d1dd4f8338c07280091bcf604f482725b46367b3daa7e1baac73b16530c02d5950f26cfb9f03ca3c3
6
+ metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
7
+ data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ Gemfile.lock
2
+ lib/text_clean/text_clean.bundle
3
+ lib/text_clean/text_clean.so
4
+ *.gem
5
+ tmp/
data/Rakefile CHANGED
@@ -1,3 +1,14 @@
1
+
2
+ require 'rspec'
3
+ require 'rspec/core/rake_task'
4
+
5
+ # `rake test`
6
+ desc 'Run all examples'
7
+ RSpec::Core::RakeTask.new(:test) do |t|
8
+ t.pattern = 'spec/**/*_spec.rb'
9
+ end
10
+
11
+ # `rake compile`
1
12
  require "rake/extensiontask"
2
13
 
3
14
  Rake::ExtensionTask.new "text_clean" do |ext|
data/ext/text_clean/text_clean.cc CHANGED
@@ -52,7 +52,7 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
52
52
  just_added_period = false;
53
53
  }
54
54
  } else {
55
- // scan ahead to see if this is a hyphen at the end of the line
55
+ // scan ahead to see if this hyphen is at the end of the line
56
56
  char* scan_ahead;
57
57
  for (scan_ahead = read + 1; scan_ahead < eos; scan_ahead++) {
58
58
  char s = *scan_ahead;
@@ -74,11 +74,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
74
74
  *write++ = line_sep;
75
75
  just_added_period = true;
76
76
  just_added_space = false;
77
- } else if (c == ' ' && !just_added_space && !just_added_period) {
77
+ } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
78
78
  *write++ = ' ';
79
79
  just_added_space = true;
80
80
  just_added_period = false;
81
- } else if (c >= 'a' && c <= 'z') {
81
+ } else if (c == '\'' || (c >= 'a' && c <= 'z')) {
82
82
  *write++ = c;
83
83
  just_added_space = false;
84
84
  just_added_period = false;
data/lib/text_clean/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TextClean
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
3
3
  end
data/spec/text_clean_spec.rb ADDED
@@ -0,0 +1,61 @@
1
+ require 'rspec'
2
+ require 'text_clean'
3
+
4
+ def clean(text, line_sep="\n", &block)
5
+ TextClean.clean(text, line_sep)
6
+ block.call(text)
7
+ end
8
+
9
+ def clean_eq(text, expected_after_cleaning, line_sep="\n")
10
+ clean(text, line_sep) { |t| expect(t).to eq(expected_after_cleaning) }
11
+ end
12
+
13
+ describe TextClean do
14
+ it "lowercases text" do
15
+ clean_eq("ALL UPPERCASE", "all uppercase")
16
+ end
17
+
18
+ it "compacts whitespace" do
19
+ clean_eq("is\t it so?", "is it so\n")
20
+ end
21
+
22
+ it "treats [,/&] as whitespace" do
23
+ clean_eq("a,bb&cc/d", "a bb cc d")
24
+ end
25
+
26
+ it "treats [;:!?] as sentence separators" do
27
+ clean_eq("x;y?z!:q", "x.y.z.q", ".")
28
+ end
29
+
30
+ it "joins hyphenated words at line end" do
31
+ clean_eq("satis-\nfaction", "satisfaction")
32
+ end
33
+
34
+ it "ignores whitespace after hyphen at line end" do
35
+ clean_eq("satis- \t \nfaction", "satisfaction")
36
+ end
37
+
38
+ it "treats a double hyphen as word separator" do
39
+ clean_eq("good--do it", "good do it")
40
+ end
41
+
42
+ it "compacts whitespace around double hyphen" do
43
+ clean_eq("good -- do it", "good do it")
44
+ end
45
+
46
+ it "treats newlines as word separator" do
47
+ clean_eq("a\nb\nc", "a b c")
48
+ end
49
+
50
+ it "ignores numbers" do
51
+ clean_eq("123abc", "abc")
52
+ end
53
+
54
+ it "ignores quotes and braces" do
55
+ clean_eq("[jim] \"speaks\"", "jim speaks")
56
+ end
57
+
58
+ it "keeps apostrophes" do
59
+ clean_eq("dad's", "dad's")
60
+ end
61
+ end
data/text_clean.gemspec CHANGED
@@ -24,4 +24,5 @@ Gem::Specification.new do |spec|
24
24
  spec.add_development_dependency "rake", "~> 10.3"
25
25
  spec.add_development_dependency "rake-compiler", "~> 0.9"
26
26
  spec.add_development_dependency "byebug", "~> 3.4"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
27
28
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_clean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duane Johnson
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '3.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
69
83
  description: Cleans text by removing punctuation, lowercasing. Very fast.
70
84
  email:
71
85
  - duane.johnson@gmail.com
@@ -76,6 +90,7 @@ extensions:
76
90
  - ext/text_clean/extconf.rb
77
91
  extra_rdoc_files: []
78
92
  files:
93
+ - ".gitignore"
79
94
  - Gemfile
80
95
  - Rakefile
81
96
  - bin/text_clean
@@ -84,6 +99,7 @@ files:
84
99
  - ext/text_clean/text_clean.cc
85
100
  - lib/text_clean.rb
86
101
  - lib/text_clean/version.rb
102
+ - spec/text_clean_spec.rb
87
103
  - text_clean.gemspec
88
104
  homepage: https://github.com/wordtreefoundation
89
105
  licenses:
@@ -110,4 +126,5 @@ rubygems_version: 2.2.2
110
126
  signing_key:
111
127
  specification_version: 4
112
128
  summary: Text cleaner
113
- test_files: []
129
+ test_files:
130
+ - spec/text_clean_spec.rb