wordtree 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,50 +13,32 @@ describe WordTree::Book do
13
13
  end
14
14
 
15
15
  it "can return cleaned content" do
16
- book = WordTree::Book.create("book", {}, "Wi&ld\nContent!")
17
- expect(book.content_clean).to eq("wild content\n")
16
+ content = "Wi&ld\nContent!"
17
+ book = WordTree::Book.create("book", {}, content)
18
+ expect(content).to eq("Wi&ld\nContent!")
19
+ expect(book.content_clean).to eq("wild content.")
18
20
  end
19
21
 
20
- context "ngrams" do
21
- let(:content) { "A man. A plan. And a man."}
22
- let(:book) { WordTree::Book.create("book", {}, content) }
23
- let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
24
- let(:two_grams) {
25
- {"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
26
- "plan ." => 1, ". and" => 1, "and a" => 1}
27
- }
28
- describe "#count_ngrams" do
29
- it "creates a hash lookup table" do
30
- hash = book.count_ngrams(1)
31
- expect(hash).to be_a(Hash)
32
- end
33
-
34
- it "has counts of ngrams" do
35
- hash = book.count_ngrams(1)
36
- expect(hash).to eq(one_grams)
37
- hash = book.count_ngrams(2)
38
- expect(hash).to eq(two_grams)
39
- end
40
-
41
- it "memoizes ngrams" do
42
- expect(book).to receive(:count_ngrams).with(1).and_return(one_grams)
43
- expect(book.ngrams(1)).to eq one_grams
44
- expect(book).to_not receive(:count_ngrams)
45
- expect(book.ngrams(1)).to eq one_grams
46
- end
47
- end
48
-
49
- describe "#set_ngrams" do
50
- it "sets the lookup hash" do
51
- book.set_ngrams(1, {"one" => 1})
52
- expect(book.ngrams(1)).to eq("one" => 1)
53
- expect(book.ngrams(2)).to eq(two_grams)
54
- end
55
-
56
- it "raises an error when not a hash" do
57
- expect{ book.set_ngrams(1, "string") }.to raise_error
58
- expect{ book.set_ngrams(1, nil) }.to raise_error
59
- end
60
- end
61
- end
22
+ # context "ngrams" do
23
+ # let(:content) { "A man. A plan. And a man."}
24
+ # let(:book) { WordTree::Book.create("book", {}, content) }
25
+ # let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
26
+ # let(:two_grams) {
27
+ # {"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
28
+ # "plan ." => 1, ". and" => 1, "and a" => 1}
29
+ # }
30
+ # describe "#ngrams" do
31
+ # it "creates a hash lookup table" do
32
+ # hash = book.count_ngrams(1)
33
+ # expect(hash).to be_a(Hash)
34
+ # end
35
+
36
+ # it "has counts of ngrams" do
37
+ # hash = book.count_ngrams(1)
38
+ # expect(hash).to eq(one_grams)
39
+ # hash = book.count_ngrams(2)
40
+ # expect(hash).to eq(two_grams)
41
+ # end
42
+ # end
43
+ # end
62
44
  end
@@ -32,12 +32,6 @@ describe WordTree::Disk::Librarian do
32
32
  expect(book.year).to eq(1800)
33
33
  expect(book.content).to eq("Book with content")
34
34
  end
35
-
36
- it "loads ngrams if available" do
37
- book = librarian.find("book")
38
- expect(book).to_not receive(:count_ngrams)
39
- expect(book.ngrams(1)).to eq("xyz" => 1)
40
- end
41
35
  end
42
36
 
43
37
  describe "#each" do
@@ -46,43 +40,5 @@ describe WordTree::Disk::Librarian do
46
40
  expect(book_sizes).to contain_exactly(17, 23)
47
41
  end
48
42
  end
49
-
50
- it "saves ngrams to disk" do
51
- tmp_root = Dir.mktmpdir
52
- tmp_library = WordTree::Disk::Library.new(tmp_root)
53
- tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
54
-
55
- book = librarian.find("book")
56
- book.ngrams(1)
57
- book.ngrams(2)
58
-
59
- tmp_librarian.save(book)
60
-
61
- ngrams_filepath = tmp_library.path_to("book", :ngrams, :n => 1)
62
- expect(File.exist?(ngrams_filepath)).to be_truthy
63
- end
64
-
65
- it "saves to disk (yaml, content)" do
66
- tmp_root = Dir.mktmpdir
67
- tmp_library = WordTree::Disk::Library.new(tmp_root)
68
- tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
69
-
70
- book = librarian.find_without_ngrams("book")
71
-
72
- book.source = "test"
73
- book.content += "."
74
-
75
- tmp_librarian.save(book)
76
-
77
- updated = Preamble.load(tmp_library.path_to("book"))
78
- expect(updated.metadata).to eq(
79
- :id => "book",
80
- :archive_org_id => "book",
81
- :year => 1800,
82
- :source => "test",
83
- :size_bytes => 17)
84
- expect(updated.content).to eq("Book with content.")
85
- end
86
-
87
43
  end
88
44
  end
@@ -0,0 +1,81 @@
1
+ require 'spec_helper'
2
+ require 'wordtree/text'
3
+ require 'timeout'
4
+
5
+ describe WordTree::Text do
6
+ context "#split_near" do
7
+ it "splits on spaces" do
8
+ line, rem = WordTree::Text.split_near("it is near", 7)
9
+ expect(line).to eq("it is")
10
+ expect(rem).to eq("near")
11
+ end
12
+
13
+ it "removes a space if index lands on one" do
14
+ line, rem = WordTree::Text.split_near("it is near", 5)
15
+ expect(line).to eq("it is")
16
+ expect(rem).to eq("near")
17
+ end
18
+
19
+ it "keeps the whole line if index is >= length of line" do
20
+ line, rem = WordTree::Text.split_near("it is near", 10)
21
+ expect(line).to eq("it is near")
22
+ expect(rem).to eq("")
23
+
24
+ line, rem = WordTree::Text.split_near("it is near", 11)
25
+ expect(line).to eq("it is near")
26
+ expect(rem).to eq("")
27
+ end
28
+
29
+ it "splits at the index anyway if no spaces are found" do
30
+ line, rem = WordTree::Text.split_near("itisnear", 4)
31
+ expect(line).to eq("itis")
32
+ expect(rem).to eq("near")
33
+ end
34
+ end
35
+
36
+ describe "#clean" do
37
+ it "wraps" do
38
+ sample_text = "This, [here] is awesome, right"
39
+ cleaned = WordTree::Text.clean(sample_text)
40
+ expect(cleaned).to eq("this here is awesome right")
41
+ end
42
+
43
+ it "joins lines ending in -" do
44
+ sample_text = "What-\never\ndo you\n mean?"
45
+ cleaned = WordTree::Text.clean(sample_text)
46
+ expect(cleaned).to eq("whatever do you mean.")
47
+ end
48
+
49
+ it "does not ignore sentence boundaries" do
50
+ sample_text = "This is a sentence. And so is this? Keep the dots."
51
+ cleaned = WordTree::Text.clean(sample_text)
52
+ expect(cleaned).to eq("this is a sentence.and so is this.keep the dots.")
53
+ end
54
+
55
+ it "compresses sentence boundary punctuation and spaces" do
56
+ sample_text = "words . . and.. stuff"
57
+ cleaned = WordTree::Text.clean(sample_text)
58
+ expect(cleaned).to eq("words.and.stuff")
59
+ end
60
+ end
61
+
62
+ describe "#common_trigrams" do
63
+ it "returns 0 for strings of len < 3" do
64
+ expect(WordTree::Text.common_trigrams("")).to eq 0
65
+ expect(WordTree::Text.common_trigrams("1")).to eq 0
66
+ expect(WordTree::Text.common_trigrams("12")).to eq 0
67
+ end
68
+
69
+ it "returns 0 for strings without common trigrams" do
70
+ expect(WordTree::Text.common_trigrams("!{*@*!()}")).to eq 0
71
+ expect(WordTree::Text.common_trigrams("qwrtypzzx")).to eq 0
72
+ expect(WordTree::Text.common_trigrams(" ")).to eq 0
73
+ end
74
+
75
+ it "returns correct counts for strings with trigrams" do
76
+ expect(WordTree::Text.common_trigrams("what")).to eq 1
77
+ expect(WordTree::Text.common_trigrams("the wall")).to eq 2
78
+ end
79
+ end
80
+
81
+ end
data/wordtree.gemspec CHANGED
@@ -5,12 +5,12 @@ require 'wordtree/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "wordtree"
8
- spec.version = Wordtree::VERSION
8
+ spec.version = WordTree::VERSION
9
9
  spec.authors = ["Duane Johnson"]
10
10
  spec.email = ["duane.johnson@gmail.com"]
11
11
  spec.description = %q{WordTree common library code}
12
- spec.summary = %q{Wordtree common library code}
13
- spec.homepage = ""
12
+ spec.summary = %q{WordTree common library code}
13
+ spec.homepage = "https://github.com/wordtreefoundation/wordtree-ruby"
14
14
  spec.license = "MIT"
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
@@ -18,19 +18,21 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_dependency "virtus"
22
- spec.add_dependency "preamble", ">= 0.0.3"
23
- spec.add_dependency "archivist-client", ">= 0.1.7"
24
- spec.add_dependency "retriable"
21
+ spec.extensions = %w[ext/extconf.rb]
22
+
23
+ spec.add_dependency "virtus", "~> 1.0"
24
+ spec.add_dependency "preamble", "0.0.3"
25
+ spec.add_dependency "archivist-client", "0.1.7"
26
+ spec.add_dependency "retriable", "1.4.1"
25
27
  spec.add_dependency "simhash", "0.2.5"
26
- spec.add_dependency "rethinkdb", "~> 1.14.0"
28
+ spec.add_dependency "rethinkdb", "~> 1.14"
27
29
 
28
30
  spec.add_development_dependency "bundler", "~> 1.3"
29
- spec.add_development_dependency "rake"
30
- spec.add_development_dependency "debugger"
31
- spec.add_development_dependency "rspec"
32
- spec.add_development_dependency "guard"
33
- spec.add_development_dependency "guard-rspec"
34
- spec.add_development_dependency "vcr"
35
- spec.add_development_dependency "webmock"
31
+ spec.add_development_dependency "rake", "~> 10.3"
32
+ spec.add_development_dependency "byebug", "~> 3.4"
33
+ spec.add_development_dependency "rspec", "~> 3.1"
34
+ spec.add_development_dependency "guard", "~> 2.6"
35
+ spec.add_development_dependency "guard-rspec", "~> 4.3"
36
+ spec.add_development_dependency "vcr", "~> 2.9"
37
+ spec.add_development_dependency "webmock", "~> 1.18"
36
38
  end
metadata CHANGED
@@ -1,84 +1,74 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
5
- prerelease:
4
+ version: 0.4.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Duane Johnson
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2014-09-08 00:00:00.000000000 Z
11
+ date: 2014-09-26 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: virtus
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: '0'
19
+ version: '1.0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - "~>"
28
25
  - !ruby/object:Gem::Version
29
- version: '0'
26
+ version: '1.0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: preamble
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '='
36
32
  - !ruby/object:Gem::Version
37
33
  version: 0.0.3
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '='
44
39
  - !ruby/object:Gem::Version
45
40
  version: 0.0.3
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: archivist-client
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - '='
52
46
  - !ruby/object:Gem::Version
53
47
  version: 0.1.7
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - '='
60
53
  - !ruby/object:Gem::Version
61
54
  version: 0.1.7
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: retriable
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
- - - ! '>='
59
+ - - '='
68
60
  - !ruby/object:Gem::Version
69
- version: '0'
61
+ version: 1.4.1
70
62
  type: :runtime
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
- - - ! '>='
66
+ - - '='
76
67
  - !ruby/object:Gem::Version
77
- version: '0'
68
+ version: 1.4.1
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: simhash
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - '='
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :runtime
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - '='
92
81
  - !ruby/object:Gem::Version
@@ -94,169 +83,157 @@ dependencies:
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: rethinkdb
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
- - - ~>
87
+ - - "~>"
100
88
  - !ruby/object:Gem::Version
101
- version: 1.14.0
89
+ version: '1.14'
102
90
  type: :runtime
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
- - - ~>
94
+ - - "~>"
108
95
  - !ruby/object:Gem::Version
109
- version: 1.14.0
96
+ version: '1.14'
110
97
  - !ruby/object:Gem::Dependency
111
98
  name: bundler
112
99
  requirement: !ruby/object:Gem::Requirement
113
- none: false
114
100
  requirements:
115
- - - ~>
101
+ - - "~>"
116
102
  - !ruby/object:Gem::Version
117
103
  version: '1.3'
118
104
  type: :development
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
107
  requirements:
123
- - - ~>
108
+ - - "~>"
124
109
  - !ruby/object:Gem::Version
125
110
  version: '1.3'
126
111
  - !ruby/object:Gem::Dependency
127
112
  name: rake
128
113
  requirement: !ruby/object:Gem::Requirement
129
- none: false
130
114
  requirements:
131
- - - ! '>='
115
+ - - "~>"
132
116
  - !ruby/object:Gem::Version
133
- version: '0'
117
+ version: '10.3'
134
118
  type: :development
135
119
  prerelease: false
136
120
  version_requirements: !ruby/object:Gem::Requirement
137
- none: false
138
121
  requirements:
139
- - - ! '>='
122
+ - - "~>"
140
123
  - !ruby/object:Gem::Version
141
- version: '0'
124
+ version: '10.3'
142
125
  - !ruby/object:Gem::Dependency
143
- name: debugger
126
+ name: byebug
144
127
  requirement: !ruby/object:Gem::Requirement
145
- none: false
146
128
  requirements:
147
- - - ! '>='
129
+ - - "~>"
148
130
  - !ruby/object:Gem::Version
149
- version: '0'
131
+ version: '3.4'
150
132
  type: :development
151
133
  prerelease: false
152
134
  version_requirements: !ruby/object:Gem::Requirement
153
- none: false
154
135
  requirements:
155
- - - ! '>='
136
+ - - "~>"
156
137
  - !ruby/object:Gem::Version
157
- version: '0'
138
+ version: '3.4'
158
139
  - !ruby/object:Gem::Dependency
159
140
  name: rspec
160
141
  requirement: !ruby/object:Gem::Requirement
161
- none: false
162
142
  requirements:
163
- - - ! '>='
143
+ - - "~>"
164
144
  - !ruby/object:Gem::Version
165
- version: '0'
145
+ version: '3.1'
166
146
  type: :development
167
147
  prerelease: false
168
148
  version_requirements: !ruby/object:Gem::Requirement
169
- none: false
170
149
  requirements:
171
- - - ! '>='
150
+ - - "~>"
172
151
  - !ruby/object:Gem::Version
173
- version: '0'
152
+ version: '3.1'
174
153
  - !ruby/object:Gem::Dependency
175
154
  name: guard
176
155
  requirement: !ruby/object:Gem::Requirement
177
- none: false
178
156
  requirements:
179
- - - ! '>='
157
+ - - "~>"
180
158
  - !ruby/object:Gem::Version
181
- version: '0'
159
+ version: '2.6'
182
160
  type: :development
183
161
  prerelease: false
184
162
  version_requirements: !ruby/object:Gem::Requirement
185
- none: false
186
163
  requirements:
187
- - - ! '>='
164
+ - - "~>"
188
165
  - !ruby/object:Gem::Version
189
- version: '0'
166
+ version: '2.6'
190
167
  - !ruby/object:Gem::Dependency
191
168
  name: guard-rspec
192
169
  requirement: !ruby/object:Gem::Requirement
193
- none: false
194
170
  requirements:
195
- - - ! '>='
171
+ - - "~>"
196
172
  - !ruby/object:Gem::Version
197
- version: '0'
173
+ version: '4.3'
198
174
  type: :development
199
175
  prerelease: false
200
176
  version_requirements: !ruby/object:Gem::Requirement
201
- none: false
202
177
  requirements:
203
- - - ! '>='
178
+ - - "~>"
204
179
  - !ruby/object:Gem::Version
205
- version: '0'
180
+ version: '4.3'
206
181
  - !ruby/object:Gem::Dependency
207
182
  name: vcr
208
183
  requirement: !ruby/object:Gem::Requirement
209
- none: false
210
184
  requirements:
211
- - - ! '>='
185
+ - - "~>"
212
186
  - !ruby/object:Gem::Version
213
- version: '0'
187
+ version: '2.9'
214
188
  type: :development
215
189
  prerelease: false
216
190
  version_requirements: !ruby/object:Gem::Requirement
217
- none: false
218
191
  requirements:
219
- - - ! '>='
192
+ - - "~>"
220
193
  - !ruby/object:Gem::Version
221
- version: '0'
194
+ version: '2.9'
222
195
  - !ruby/object:Gem::Dependency
223
196
  name: webmock
224
197
  requirement: !ruby/object:Gem::Requirement
225
- none: false
226
198
  requirements:
227
- - - ! '>='
199
+ - - "~>"
228
200
  - !ruby/object:Gem::Version
229
- version: '0'
201
+ version: '1.18'
230
202
  type: :development
231
203
  prerelease: false
232
204
  version_requirements: !ruby/object:Gem::Requirement
233
- none: false
234
205
  requirements:
235
- - - ! '>='
206
+ - - "~>"
236
207
  - !ruby/object:Gem::Version
237
- version: '0'
208
+ version: '1.18'
238
209
  description: WordTree common library code
239
210
  email:
240
211
  - duane.johnson@gmail.com
241
212
  executables: []
242
- extensions: []
213
+ extensions:
214
+ - ext/extconf.rb
243
215
  extra_rdoc_files: []
244
216
  files:
245
- - .gitignore
246
- - .rspec
217
+ - ".gitignore"
218
+ - ".rspec"
247
219
  - Gemfile
248
220
  - Guardfile
249
221
  - LICENSE.txt
250
222
  - README.md
251
223
  - Rakefile
224
+ - ext/Makefile
225
+ - ext/extconf.rb
226
+ - ext/wordtree.cc
252
227
  - lib/wordtree.rb
253
228
  - lib/wordtree/archdown.rb
254
229
  - lib/wordtree/book.rb
230
+ - lib/wordtree/book_list.rb
255
231
  - lib/wordtree/db/librarian.rb
256
232
  - lib/wordtree/disk/librarian.rb
257
233
  - lib/wordtree/disk/library.rb
258
234
  - lib/wordtree/disk/library_locator.rb
259
- - lib/wordtree/text_utils.rb
235
+ - lib/wordtree/ngrams.rb
236
+ - lib/wordtree/text.rb
260
237
  - lib/wordtree/version.rb
261
238
  - spec/fixtures/cassettes/archive_org_download_book.yml
262
239
  - spec/fixtures/library/bo/ok/book/book.1grams.json
@@ -268,33 +245,32 @@ files:
268
245
  - spec/wordtree/disk/librarian_spec.rb
269
246
  - spec/wordtree/disk/library_locator_spec.rb
270
247
  - spec/wordtree/disk/library_spec.rb
271
- - spec/wordtree/text_utils_spec.rb
248
+ - spec/wordtree/text_spec.rb
272
249
  - wordtree.gemspec
273
- homepage: ''
250
+ homepage: https://github.com/wordtreefoundation/wordtree-ruby
274
251
  licenses:
275
252
  - MIT
253
+ metadata: {}
276
254
  post_install_message:
277
255
  rdoc_options: []
278
256
  require_paths:
279
257
  - lib
280
258
  required_ruby_version: !ruby/object:Gem::Requirement
281
- none: false
282
259
  requirements:
283
- - - ! '>='
260
+ - - ">="
284
261
  - !ruby/object:Gem::Version
285
262
  version: '0'
286
263
  required_rubygems_version: !ruby/object:Gem::Requirement
287
- none: false
288
264
  requirements:
289
- - - ! '>='
265
+ - - ">="
290
266
  - !ruby/object:Gem::Version
291
267
  version: '0'
292
268
  requirements: []
293
269
  rubyforge_project:
294
- rubygems_version: 1.8.23
270
+ rubygems_version: 2.2.2
295
271
  signing_key:
296
- specification_version: 3
297
- summary: Wordtree common library code
272
+ specification_version: 4
273
+ summary: WordTree common library code
298
274
  test_files:
299
275
  - spec/fixtures/cassettes/archive_org_download_book.yml
300
276
  - spec/fixtures/library/bo/ok/book/book.1grams.json
@@ -306,5 +282,4 @@ test_files:
306
282
  - spec/wordtree/disk/librarian_spec.rb
307
283
  - spec/wordtree/disk/library_locator_spec.rb
308
284
  - spec/wordtree/disk/library_spec.rb
309
- - spec/wordtree/text_utils_spec.rb
310
- has_rdoc:
285
+ - spec/wordtree/text_spec.rb