wordtree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/wordtree/book.rb +38 -1
- data/lib/wordtree/disk/librarian.rb +31 -2
- data/lib/wordtree/disk/library.rb +8 -5
- data/lib/wordtree/text_utils.rb +31 -0
- data/lib/wordtree/version.rb +1 -1
- data/spec/fixtures/library/bo/ok/book/book.1grams.json +3 -0
- data/spec/wordtree/book_spec.rb +43 -0
- data/spec/wordtree/disk/librarian_spec.rb +22 -1
- data/spec/wordtree/disk/library_spec.rb +6 -0
- data/spec/wordtree/text_utils_spec.rb +30 -4
- metadata +3 -1
data/lib/wordtree/book.rb
CHANGED
@@ -21,6 +21,11 @@ module WordTree
|
|
21
21
|
|
22
22
|
attribute :content, String
|
23
23
|
|
24
|
+
def initialize(*args)
|
25
|
+
super
|
26
|
+
@ngrams = {}
|
27
|
+
end
|
28
|
+
|
24
29
|
def self.create(id, metadata, content)
|
25
30
|
new(metadata.merge("id" => id, "content" => content))
|
26
31
|
end
|
@@ -34,13 +39,45 @@ module WordTree
|
|
34
39
|
end
|
35
40
|
|
36
41
|
def content_clean(wrap=120)
|
37
|
-
|
42
|
+
if @content_clean_wrap != wrap
|
43
|
+
# Memoize content_clean (using last wrap size)
|
44
|
+
@content_clean_wrap = wrap
|
45
|
+
@content_clean = TextUtils.clean_text(content, wrap)
|
46
|
+
end
|
47
|
+
@content_clean
|
38
48
|
end
|
39
49
|
|
40
50
|
def content_size
|
41
51
|
content ? content.size : nil
|
42
52
|
end
|
43
53
|
|
54
|
+
def each_ngram(n=1, &block)
|
55
|
+
TextUtils.each_ngram(content_clean, n, &block)
|
56
|
+
end
|
57
|
+
|
58
|
+
def set_ngrams(n, lookup)
|
59
|
+
raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
|
60
|
+
@ngrams[n] = lookup
|
61
|
+
end
|
62
|
+
|
63
|
+
def ngrams(n=1)
|
64
|
+
# Memoize ngram counts
|
65
|
+
@ngrams[n] ||= count_ngrams(n)
|
66
|
+
end
|
67
|
+
|
68
|
+
def all_ngrams
|
69
|
+
@ngrams
|
70
|
+
end
|
71
|
+
|
72
|
+
def count_ngrams(n=1)
|
73
|
+
{}.tap do |tally|
|
74
|
+
each_ngram(n) do |ngram|
|
75
|
+
tally[ngram] ||= 0
|
76
|
+
tally[ngram] += 1
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
44
81
|
def calculate_simhash
|
45
82
|
content ? content_clean.simhash(:split_by => /\s/) : nil
|
46
83
|
end
|
data/lib/wordtree/disk/librarian.rb
CHANGED
@@ -20,7 +20,7 @@ module WordTree
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def find(book_id)
|
23
|
+
def find_without_ngrams(book_id)
|
24
24
|
begin
|
25
25
|
retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
|
26
26
|
Book.create(book_id, retrieved.metadata, retrieved.content)
|
@@ -29,6 +29,20 @@ module WordTree
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
+
def find(book_id)
|
33
|
+
find_without_ngrams(book_id).tap do |book|
|
34
|
+
(1..9).each do |n|
|
35
|
+
path = library.path_to(book_id, :ngrams, :n => n)
|
36
|
+
if File.exist?(path)
|
37
|
+
File.open(path) do |f|
|
38
|
+
hash = JSON.load(f)
|
39
|
+
book.set_ngrams(n, hash)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
32
46
|
def each(file_suffix_re=/\.(md|txt)$/, &block)
|
33
47
|
library.each(file_suffix_re) do |path|
|
34
48
|
retrieved = Preamble.load(path, :external_encoding => "utf-8")
|
@@ -36,11 +50,26 @@ module WordTree
|
|
36
50
|
end
|
37
51
|
end
|
38
52
|
|
39
|
-
def save(book)
|
53
|
+
def save_without_ngrams(book)
|
40
54
|
library.mkdir(book.id)
|
41
55
|
Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
|
42
56
|
end
|
43
57
|
|
58
|
+
def save_ngrams(book)
|
59
|
+
book.all_ngrams.each_pair do |n, hash|
|
60
|
+
path = library.path_to(book.id, :ngrams, :n => n)
|
61
|
+
File.open(path, "w") do |file|
|
62
|
+
file.write hash.to_json
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def save(book)
|
68
|
+
save_without_ngrams(book).tap do
|
69
|
+
save_ngrams(book)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
44
73
|
def archive_org_get(*book_ids, &block)
|
45
74
|
book_ids.map do |book_id|
|
46
75
|
archive_org_get_with_conditions(identifier: book_id, &block)
|
data/lib/wordtree/disk/library.rb
CHANGED
@@ -10,7 +10,8 @@ module WordTree
|
|
10
10
|
include Enumerable
|
11
11
|
|
12
12
|
FILE_TYPES = {
|
13
|
-
:raw => "%{id}.md"
|
13
|
+
:raw => "%{id}.md",
|
14
|
+
:ngrams => "%{id}.%{n}grams.json"
|
14
15
|
}
|
15
16
|
|
16
17
|
# The file path to the root of the library directory, e.g. /data/library
|
@@ -26,13 +27,15 @@ module WordTree
|
|
26
27
|
File.expand_path(LibraryLocator.identity(book_id).relpath, root)
|
27
28
|
end
|
28
29
|
|
29
|
-
def path_to(book_id, type=:raw)
|
30
|
-
File.join(dir_of(book_id), file_type(book_id, type))
|
30
|
+
def path_to(book_id, type=:raw, opts={})
|
31
|
+
File.join(dir_of(book_id), file_type(book_id, type, opts))
|
31
32
|
end
|
32
33
|
|
33
|
-
def file_type(book_id, type=:raw)
|
34
|
+
def file_type(book_id, type=:raw, opts={})
|
34
35
|
locator = LibraryLocator.identity(book_id)
|
35
|
-
FILE_TYPES[type]
|
36
|
+
template = FILE_TYPES[type]
|
37
|
+
raise ArgumentError, "unable to find file type template #{type.inspect}" if template.nil?
|
38
|
+
template % {:id => locator.id}.merge(opts)
|
36
39
|
end
|
37
40
|
|
38
41
|
# Create all subdirs up to the location where a book is stored
|
data/lib/wordtree/text_utils.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module WordTree
|
2
4
|
module TextUtils
|
3
5
|
def self.split_near(text, split_index)
|
@@ -34,9 +36,12 @@ module WordTree
|
|
34
36
|
_dash = '-'.ord
|
35
37
|
_space = ' '.ord
|
36
38
|
_newline = "\n".ord
|
39
|
+
_period = '.'.ord
|
40
|
+
_question = '?'.ord
|
37
41
|
|
38
42
|
join_lines = false
|
39
43
|
just_added_space = false
|
44
|
+
just_added_period = false
|
40
45
|
line_length = 0
|
41
46
|
input.each_char do |c|
|
42
47
|
c = c.ord
|
@@ -44,17 +49,28 @@ module WordTree
|
|
44
49
|
c -= 32 if (c >= _A && c <= _Z)
|
45
50
|
# Change newlines to spaces
|
46
51
|
c = _space if c == _newline
|
52
|
+
# Change question marks to periods (i.e. both count as sentence boundaries)
|
53
|
+
c = _period if c == _question
|
47
54
|
|
48
55
|
if c == _dash
|
49
56
|
# In case of a dash, set the scoop-spaces-up flag
|
50
57
|
join_lines = true
|
51
58
|
elsif join_lines && (c == _space)
|
52
59
|
# ignore
|
60
|
+
elsif (c == _period) && !just_added_period
|
61
|
+
if !just_added_space
|
62
|
+
output << _space.chr
|
63
|
+
end
|
64
|
+
output << c.chr
|
65
|
+
just_added_period = true
|
66
|
+
just_added_space = true
|
53
67
|
elsif (c >= _a && c <= _z) || (c == _space && !just_added_space)
|
54
68
|
# Add letters and spaces
|
69
|
+
output << _space.chr if just_added_period
|
55
70
|
output << c.chr
|
56
71
|
line_length += 1
|
57
72
|
just_added_space = (c == _space)
|
73
|
+
just_added_period = false
|
58
74
|
join_lines = false
|
59
75
|
end
|
60
76
|
end
|
@@ -69,5 +85,20 @@ module WordTree
|
|
69
85
|
|
70
86
|
return wrapped_output
|
71
87
|
end
|
88
|
+
|
89
|
+
def self.each_ngram(input, n=1, &block)
|
90
|
+
onegram_re = /([^ \n]+[ \n])/
|
91
|
+
ngram_re = /([^ \n]+[ \n]){#{n},#{n}}/
|
92
|
+
s = StringScanner.new(input)
|
93
|
+
while !s.eos?
|
94
|
+
if words = s.scan(ngram_re)
|
95
|
+
yield words.rstrip.tr("\n", " ") if block_given?
|
96
|
+
# Move back to beginning of n-word sequence
|
97
|
+
s.unscan
|
98
|
+
end
|
99
|
+
# Move forward one word
|
100
|
+
s.scan(onegram_re)
|
101
|
+
end
|
102
|
+
end
|
72
103
|
end
|
73
104
|
end
|
data/lib/wordtree/version.rb
CHANGED
data/spec/wordtree/book_spec.rb
CHANGED
@@ -16,4 +16,47 @@ describe WordTree::Book do
|
|
16
16
|
book = WordTree::Book.create("book", {}, "Wi&ld\nContent!")
|
17
17
|
expect(book.content_clean).to eq("wild content\n")
|
18
18
|
end
|
19
|
+
|
20
|
+
context "ngrams" do
|
21
|
+
let(:content) { "A man. A plan. And a man."}
|
22
|
+
let(:book) { WordTree::Book.create("book", {}, content) }
|
23
|
+
let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
|
24
|
+
let(:two_grams) {
|
25
|
+
{"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
|
26
|
+
"plan ." => 1, ". and" => 1, "and a" => 1}
|
27
|
+
}
|
28
|
+
describe "#count_ngrams" do
|
29
|
+
it "creates a hash lookup table" do
|
30
|
+
hash = book.count_ngrams(1)
|
31
|
+
expect(hash).to be_a(Hash)
|
32
|
+
end
|
33
|
+
|
34
|
+
it "has counts of ngrams" do
|
35
|
+
hash = book.count_ngrams(1)
|
36
|
+
expect(hash).to eq(one_grams)
|
37
|
+
hash = book.count_ngrams(2)
|
38
|
+
expect(hash).to eq(two_grams)
|
39
|
+
end
|
40
|
+
|
41
|
+
it "memoizes ngrams" do
|
42
|
+
expect(book).to receive(:count_ngrams).with(1).and_return(one_grams)
|
43
|
+
expect(book.ngrams(1)).to eq one_grams
|
44
|
+
expect(book).to_not receive(:count_ngrams)
|
45
|
+
expect(book.ngrams(1)).to eq one_grams
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
describe "#set_ngrams" do
|
50
|
+
it "sets the lookup hash" do
|
51
|
+
book.set_ngrams(1, {"one" => 1})
|
52
|
+
expect(book.ngrams(1)).to eq("one" => 1)
|
53
|
+
expect(book.ngrams(2)).to eq(two_grams)
|
54
|
+
end
|
55
|
+
|
56
|
+
it "raises an error when not a hash" do
|
57
|
+
expect{ book.set_ngrams(1, "string") }.to raise_error
|
58
|
+
expect{ book.set_ngrams(1, nil) }.to raise_error
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
19
62
|
end
|
data/spec/wordtree/disk/librarian_spec.rb
CHANGED
@@ -32,6 +32,12 @@ describe WordTree::Disk::Librarian do
|
|
32
32
|
expect(book.year).to eq(1800)
|
33
33
|
expect(book.content).to eq("Book with content")
|
34
34
|
end
|
35
|
+
|
36
|
+
it "loads ngrams if available" do
|
37
|
+
book = librarian.find("book")
|
38
|
+
expect(book).to_not receive(:count_ngrams)
|
39
|
+
expect(book.ngrams(1)).to eq("xyz" => 1)
|
40
|
+
end
|
35
41
|
end
|
36
42
|
|
37
43
|
describe "#each" do
|
@@ -41,12 +47,27 @@ describe WordTree::Disk::Librarian do
|
|
41
47
|
end
|
42
48
|
end
|
43
49
|
|
44
|
-
it "saves to disk" do
|
50
|
+
it "saves ngrams to disk" do
|
45
51
|
tmp_root = Dir.mktmpdir
|
46
52
|
tmp_library = WordTree::Disk::Library.new(tmp_root)
|
47
53
|
tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
|
48
54
|
|
49
55
|
book = librarian.find("book")
|
56
|
+
book.ngrams(1)
|
57
|
+
book.ngrams(2)
|
58
|
+
|
59
|
+
tmp_librarian.save(book)
|
60
|
+
|
61
|
+
ngrams_filepath = tmp_library.path_to("book", :ngrams, :n => 1)
|
62
|
+
expect(File.exist?(ngrams_filepath)).to be_truthy
|
63
|
+
end
|
64
|
+
|
65
|
+
it "saves to disk (yaml, content)" do
|
66
|
+
tmp_root = Dir.mktmpdir
|
67
|
+
tmp_library = WordTree::Disk::Library.new(tmp_root)
|
68
|
+
tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
|
69
|
+
|
70
|
+
book = librarian.find_without_ngrams("book")
|
50
71
|
|
51
72
|
book.source = "test"
|
52
73
|
book.content += "."
|
data/spec/wordtree/text_utils_spec.rb
CHANGED
@@ -33,8 +33,8 @@ describe WordTree::TextUtils do
|
|
33
33
|
end
|
34
34
|
|
35
35
|
context "#clean_text" do
|
36
|
-
let(:sample_text) { "This, [here] is awesome, right?" }
|
37
36
|
it "wraps" do
|
37
|
+
sample_text = "This, [here] is awesome, right"
|
38
38
|
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
39
39
|
expect(cleaned).to eq("this here\nis awesome\nright\n")
|
40
40
|
|
@@ -45,10 +45,36 @@ describe WordTree::TextUtils do
|
|
45
45
|
expect(cleaned).to eq("this here is awesome right\n")
|
46
46
|
end
|
47
47
|
|
48
|
-
let(:sample_dash) { "What-\never\ndo you\n mean?"}
|
49
48
|
it "joins lines ending in -" do
|
50
|
-
|
51
|
-
|
49
|
+
sample_text = "What-\never\ndo you\n mean?"
|
50
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
51
|
+
expect(cleaned).to eq("whatever\ndo you\nmean .\n")
|
52
|
+
end
|
53
|
+
|
54
|
+
it "does not ignore sentence boundaries" do
|
55
|
+
sample_text = "This is a sentence. And so is this? Keep the dots."
|
56
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
57
|
+
expect(cleaned).to eq("this is a sentence . and so is this . keep the dots .\n")
|
58
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
59
|
+
expect(cleaned).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
|
60
|
+
end
|
61
|
+
|
62
|
+
it "compresses sentence boundary punctuation and spaces" do
|
63
|
+
sample_text = "words . . and.. stuff"
|
64
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
65
|
+
expect(cleaned).to eq("words . and . stuff\n")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
context "#each_ngram" do
|
70
|
+
it "yields ngrams in succession" do
|
71
|
+
sample_text = "one word\n. two\n"
|
72
|
+
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
|
73
|
+
yield_successive_args("one", "word", ".", "two")
|
74
|
+
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 2, &b) }.to \
|
75
|
+
yield_successive_args("one word", "word .", ". two")
|
76
|
+
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 3, &b) }.to \
|
77
|
+
yield_successive_args("one word .", "word . two")
|
52
78
|
end
|
53
79
|
end
|
54
80
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -259,6 +259,7 @@ files:
|
|
259
259
|
- lib/wordtree/text_utils.rb
|
260
260
|
- lib/wordtree/version.rb
|
261
261
|
- spec/fixtures/cassettes/archive_org_download_book.yml
|
262
|
+
- spec/fixtures/library/bo/ok/book/book.1grams.json
|
262
263
|
- spec/fixtures/library/bo/ok/book/book.md
|
263
264
|
- spec/fixtures/library/ot/er/other/other.md
|
264
265
|
- spec/spec_helper.rb
|
@@ -296,6 +297,7 @@ specification_version: 3
|
|
296
297
|
summary: Wordtree common library code
|
297
298
|
test_files:
|
298
299
|
- spec/fixtures/cassettes/archive_org_download_book.yml
|
300
|
+
- spec/fixtures/library/bo/ok/book/book.1grams.json
|
299
301
|
- spec/fixtures/library/bo/ok/book/book.md
|
300
302
|
- spec/fixtures/library/ot/er/other/other.md
|
301
303
|
- spec/spec_helper.rb
|