wordtree 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/wordtree/book.rb +38 -1
- data/lib/wordtree/disk/librarian.rb +31 -2
- data/lib/wordtree/disk/library.rb +8 -5
- data/lib/wordtree/text_utils.rb +31 -0
- data/lib/wordtree/version.rb +1 -1
- data/spec/fixtures/library/bo/ok/book/book.1grams.json +3 -0
- data/spec/wordtree/book_spec.rb +43 -0
- data/spec/wordtree/disk/librarian_spec.rb +22 -1
- data/spec/wordtree/disk/library_spec.rb +6 -0
- data/spec/wordtree/text_utils_spec.rb +30 -4
- metadata +3 -1
data/lib/wordtree/book.rb
CHANGED
@@ -21,6 +21,11 @@ module WordTree
|
|
21
21
|
|
22
22
|
attribute :content, String
|
23
23
|
|
24
|
+
def initialize(*args)
|
25
|
+
super
|
26
|
+
@ngrams = {}
|
27
|
+
end
|
28
|
+
|
24
29
|
def self.create(id, metadata, content)
|
25
30
|
new(metadata.merge("id" => id, "content" => content))
|
26
31
|
end
|
@@ -34,13 +39,45 @@ module WordTree
|
|
34
39
|
end
|
35
40
|
|
36
41
|
def content_clean(wrap=120)
|
37
|
-
|
42
|
+
if @content_clean_wrap != wrap
|
43
|
+
# Memoize content_clean (using last wrap size)
|
44
|
+
@content_clean_wrap = wrap
|
45
|
+
@content_clean = TextUtils.clean_text(content, wrap)
|
46
|
+
end
|
47
|
+
@content_clean
|
38
48
|
end
|
39
49
|
|
40
50
|
def content_size
|
41
51
|
content ? content.size : nil
|
42
52
|
end
|
43
53
|
|
54
|
+
def each_ngram(n=1, &block)
|
55
|
+
TextUtils.each_ngram(content_clean, n, &block)
|
56
|
+
end
|
57
|
+
|
58
|
+
def set_ngrams(n, lookup)
|
59
|
+
raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
|
60
|
+
@ngrams[n] = lookup
|
61
|
+
end
|
62
|
+
|
63
|
+
def ngrams(n=1)
|
64
|
+
# Memoize ngram counts
|
65
|
+
@ngrams[n] ||= count_ngrams(n)
|
66
|
+
end
|
67
|
+
|
68
|
+
def all_ngrams
|
69
|
+
@ngrams
|
70
|
+
end
|
71
|
+
|
72
|
+
def count_ngrams(n=1)
|
73
|
+
{}.tap do |tally|
|
74
|
+
each_ngram(n) do |ngram|
|
75
|
+
tally[ngram] ||= 0
|
76
|
+
tally[ngram] += 1
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
44
81
|
def calculate_simhash
|
45
82
|
content ? content_clean.simhash(:split_by => /\s/) : nil
|
46
83
|
end
|
data/lib/wordtree/disk/librarian.rb
CHANGED
@@ -20,7 +20,7 @@ module WordTree
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def find(book_id)
|
23
|
+
def find_without_ngrams(book_id)
|
24
24
|
begin
|
25
25
|
retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
|
26
26
|
Book.create(book_id, retrieved.metadata, retrieved.content)
|
@@ -29,6 +29,20 @@ module WordTree
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
+
def find(book_id)
|
33
|
+
find_without_ngrams(book_id).tap do |book|
|
34
|
+
(1..9).each do |n|
|
35
|
+
path = library.path_to(book_id, :ngrams, :n => n)
|
36
|
+
if File.exist?(path)
|
37
|
+
File.open(path) do |f|
|
38
|
+
hash = JSON.load(f)
|
39
|
+
book.set_ngrams(n, hash)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
32
46
|
def each(file_suffix_re=/\.(md|txt)$/, &block)
|
33
47
|
library.each(file_suffix_re) do |path|
|
34
48
|
retrieved = Preamble.load(path, :external_encoding => "utf-8")
|
@@ -36,11 +50,26 @@ module WordTree
|
|
36
50
|
end
|
37
51
|
end
|
38
52
|
|
39
|
-
def save(book)
|
53
|
+
def save_without_ngrams(book)
|
40
54
|
library.mkdir(book.id)
|
41
55
|
Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
|
42
56
|
end
|
43
57
|
|
58
|
+
def save_ngrams(book)
|
59
|
+
book.all_ngrams.each_pair do |n, hash|
|
60
|
+
path = library.path_to(book.id, :ngrams, :n => n)
|
61
|
+
File.open(path, "w") do |file|
|
62
|
+
file.write hash.to_json
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def save(book)
|
68
|
+
save_without_ngrams(book).tap do
|
69
|
+
save_ngrams(book)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
44
73
|
def archive_org_get(*book_ids, &block)
|
45
74
|
book_ids.map do |book_id|
|
46
75
|
archive_org_get_with_conditions(identifier: book_id, &block)
|
data/lib/wordtree/disk/library.rb
CHANGED
@@ -10,7 +10,8 @@ module WordTree
|
|
10
10
|
include Enumerable
|
11
11
|
|
12
12
|
FILE_TYPES = {
|
13
|
-
:raw => "%
|
13
|
+
:raw => "%{id}.md",
|
14
|
+
:ngrams => "%{id}.%{n}grams.json"
|
14
15
|
}
|
15
16
|
|
16
17
|
# The file path to the root of the library directory, e.g. /data/library
|
@@ -26,13 +27,15 @@ module WordTree
|
|
26
27
|
File.expand_path(LibraryLocator.identity(book_id).relpath, root)
|
27
28
|
end
|
28
29
|
|
29
|
-
def path_to(book_id, type=:raw)
|
30
|
-
File.join(dir_of(book_id), file_type(book_id, type))
|
30
|
+
def path_to(book_id, type=:raw, opts={})
|
31
|
+
File.join(dir_of(book_id), file_type(book_id, type, opts))
|
31
32
|
end
|
32
33
|
|
33
|
-
def file_type(book_id, type=:raw)
|
34
|
+
def file_type(book_id, type=:raw, opts={})
|
34
35
|
locator = LibraryLocator.identity(book_id)
|
35
|
-
FILE_TYPES[type]
|
36
|
+
template = FILE_TYPES[type]
|
37
|
+
raise ArgumentError, "unable to find file type template #{type.inspect}" if template.nil?
|
38
|
+
template % {:id => locator.id}.merge(opts)
|
36
39
|
end
|
37
40
|
|
38
41
|
# Create all subdirs up to the location where a book is stored
|
data/lib/wordtree/text_utils.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module WordTree
|
2
4
|
module TextUtils
|
3
5
|
def self.split_near(text, split_index)
|
@@ -34,9 +36,12 @@ module WordTree
|
|
34
36
|
_dash = '-'.ord
|
35
37
|
_space = ' '.ord
|
36
38
|
_newline = "\n".ord
|
39
|
+
_period = '.'.ord
|
40
|
+
_question = '?'.ord
|
37
41
|
|
38
42
|
join_lines = false
|
39
43
|
just_added_space = false
|
44
|
+
just_added_period = false
|
40
45
|
line_length = 0
|
41
46
|
input.each_char do |c|
|
42
47
|
c = c.ord
|
@@ -44,17 +49,28 @@ module WordTree
|
|
44
49
|
c -= 32 if (c >= _A && c <= _Z)
|
45
50
|
# Change newlines to spaces
|
46
51
|
c = _space if c == _newline
|
52
|
+
# Change question marks to periods (i.e. both count as sentence boundaries)
|
53
|
+
c = _period if c == _question
|
47
54
|
|
48
55
|
if c == _dash
|
49
56
|
# In case of a dash, set the scoop-spaces-up flag
|
50
57
|
join_lines = true
|
51
58
|
elsif join_lines && (c == _space)
|
52
59
|
# ignore
|
60
|
+
elsif (c == _period) && !just_added_period
|
61
|
+
if !just_added_space
|
62
|
+
output << _space.chr
|
63
|
+
end
|
64
|
+
output << c.chr
|
65
|
+
just_added_period = true
|
66
|
+
just_added_space = true
|
53
67
|
elsif (c >= _a && c <= _z) || (c == _space && !just_added_space)
|
54
68
|
# Add letters and spaces
|
69
|
+
output << _space.chr if just_added_period
|
55
70
|
output << c.chr
|
56
71
|
line_length += 1
|
57
72
|
just_added_space = (c == _space)
|
73
|
+
just_added_period = false
|
58
74
|
join_lines = false
|
59
75
|
end
|
60
76
|
end
|
@@ -69,5 +85,20 @@ module WordTree
|
|
69
85
|
|
70
86
|
return wrapped_output
|
71
87
|
end
|
88
|
+
|
89
|
+
def self.each_ngram(input, n=1, &block)
|
90
|
+
onegram_re = /([^ \n]+[ \n])/
|
91
|
+
ngram_re = /([^ \n]+[ \n]){#{n},#{n}}/
|
92
|
+
s = StringScanner.new(input)
|
93
|
+
while !s.eos?
|
94
|
+
if words = s.scan(ngram_re)
|
95
|
+
yield words.rstrip.tr("\n", " ") if block_given?
|
96
|
+
# Move back to beginning of n-word sequence
|
97
|
+
s.unscan
|
98
|
+
end
|
99
|
+
# Move forward one word
|
100
|
+
s.scan(onegram_re)
|
101
|
+
end
|
102
|
+
end
|
72
103
|
end
|
73
104
|
end
|
data/lib/wordtree/version.rb
CHANGED
data/spec/wordtree/book_spec.rb
CHANGED
@@ -16,4 +16,47 @@ describe WordTree::Book do
|
|
16
16
|
book = WordTree::Book.create("book", {}, "Wi&ld\nContent!")
|
17
17
|
expect(book.content_clean).to eq("wild content\n")
|
18
18
|
end
|
19
|
+
|
20
|
+
context "ngrams" do
|
21
|
+
let(:content) { "A man. A plan. And a man."}
|
22
|
+
let(:book) { WordTree::Book.create("book", {}, content) }
|
23
|
+
let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
|
24
|
+
let(:two_grams) {
|
25
|
+
{"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
|
26
|
+
"plan ." => 1, ". and" => 1, "and a" => 1}
|
27
|
+
}
|
28
|
+
describe "#count_ngrams" do
|
29
|
+
it "creates a hash lookup table" do
|
30
|
+
hash = book.count_ngrams(1)
|
31
|
+
expect(hash).to be_a(Hash)
|
32
|
+
end
|
33
|
+
|
34
|
+
it "has counts of ngrams" do
|
35
|
+
hash = book.count_ngrams(1)
|
36
|
+
expect(hash).to eq(one_grams)
|
37
|
+
hash = book.count_ngrams(2)
|
38
|
+
expect(hash).to eq(two_grams)
|
39
|
+
end
|
40
|
+
|
41
|
+
it "memoizes ngrams" do
|
42
|
+
expect(book).to receive(:count_ngrams).with(1).and_return(one_grams)
|
43
|
+
expect(book.ngrams(1)).to eq one_grams
|
44
|
+
expect(book).to_not receive(:count_ngrams)
|
45
|
+
expect(book.ngrams(1)).to eq one_grams
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
describe "#set_ngrams" do
|
50
|
+
it "sets the lookup hash" do
|
51
|
+
book.set_ngrams(1, {"one" => 1})
|
52
|
+
expect(book.ngrams(1)).to eq("one" => 1)
|
53
|
+
expect(book.ngrams(2)).to eq(two_grams)
|
54
|
+
end
|
55
|
+
|
56
|
+
it "raises an error when not a hash" do
|
57
|
+
expect{ book.set_ngrams(1, "string") }.to raise_error
|
58
|
+
expect{ book.set_ngrams(1, nil) }.to raise_error
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
19
62
|
end
|
data/spec/wordtree/disk/librarian_spec.rb
CHANGED
@@ -32,6 +32,12 @@ describe WordTree::Disk::Librarian do
|
|
32
32
|
expect(book.year).to eq(1800)
|
33
33
|
expect(book.content).to eq("Book with content")
|
34
34
|
end
|
35
|
+
|
36
|
+
it "loads ngrams if available" do
|
37
|
+
book = librarian.find("book")
|
38
|
+
expect(book).to_not receive(:count_ngrams)
|
39
|
+
expect(book.ngrams(1)).to eq("xyz" => 1)
|
40
|
+
end
|
35
41
|
end
|
36
42
|
|
37
43
|
describe "#each" do
|
@@ -41,12 +47,27 @@ describe WordTree::Disk::Librarian do
|
|
41
47
|
end
|
42
48
|
end
|
43
49
|
|
44
|
-
it "saves to disk" do
|
50
|
+
it "saves ngrams to disk" do
|
45
51
|
tmp_root = Dir.mktmpdir
|
46
52
|
tmp_library = WordTree::Disk::Library.new(tmp_root)
|
47
53
|
tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
|
48
54
|
|
49
55
|
book = librarian.find("book")
|
56
|
+
book.ngrams(1)
|
57
|
+
book.ngrams(2)
|
58
|
+
|
59
|
+
tmp_librarian.save(book)
|
60
|
+
|
61
|
+
ngrams_filepath = tmp_library.path_to("book", :ngrams, :n => 1)
|
62
|
+
expect(File.exist?(ngrams_filepath)).to be_truthy
|
63
|
+
end
|
64
|
+
|
65
|
+
it "saves to disk (yaml, content)" do
|
66
|
+
tmp_root = Dir.mktmpdir
|
67
|
+
tmp_library = WordTree::Disk::Library.new(tmp_root)
|
68
|
+
tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
|
69
|
+
|
70
|
+
book = librarian.find_without_ngrams("book")
|
50
71
|
|
51
72
|
book.source = "test"
|
52
73
|
book.content += "."
|
data/spec/wordtree/text_utils_spec.rb
CHANGED
@@ -33,8 +33,8 @@ describe WordTree::TextUtils do
|
|
33
33
|
end
|
34
34
|
|
35
35
|
context "#clean_text" do
|
36
|
-
let(:sample_text) { "This, [here] is awesome, right?" }
|
37
36
|
it "wraps" do
|
37
|
+
sample_text = "This, [here] is awesome, right"
|
38
38
|
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
39
39
|
expect(cleaned).to eq("this here\nis awesome\nright\n")
|
40
40
|
|
@@ -45,10 +45,36 @@ describe WordTree::TextUtils do
|
|
45
45
|
expect(cleaned).to eq("this here is awesome right\n")
|
46
46
|
end
|
47
47
|
|
48
|
-
let(:sample_dash) { "What-\never\ndo you\n mean?"}
|
49
48
|
it "joins lines ending in -" do
|
50
|
-
|
51
|
-
|
49
|
+
sample_text = "What-\never\ndo you\n mean?"
|
50
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
51
|
+
expect(cleaned).to eq("whatever\ndo you\nmean .\n")
|
52
|
+
end
|
53
|
+
|
54
|
+
it "does not ignore sentence boundaries" do
|
55
|
+
sample_text = "This is a sentence. And so is this? Keep the dots."
|
56
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
57
|
+
expect(cleaned).to eq("this is a sentence . and so is this . keep the dots .\n")
|
58
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
59
|
+
expect(cleaned).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
|
60
|
+
end
|
61
|
+
|
62
|
+
it "compresses sentence boundary punctuation and spaces" do
|
63
|
+
sample_text = "words . . and.. stuff"
|
64
|
+
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
65
|
+
expect(cleaned).to eq("words . and . stuff\n")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
context "#each_ngram" do
|
70
|
+
it "yields ngrams in succession" do
|
71
|
+
sample_text = "one word\n. two\n"
|
72
|
+
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
|
73
|
+
yield_successive_args("one", "word", ".", "two")
|
74
|
+
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 2, &b) }.to \
|
75
|
+
yield_successive_args("one word", "word .", ". two")
|
76
|
+
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 3, &b) }.to \
|
77
|
+
yield_successive_args("one word .", "word . two")
|
52
78
|
end
|
53
79
|
end
|
54
80
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -259,6 +259,7 @@ files:
|
|
259
259
|
- lib/wordtree/text_utils.rb
|
260
260
|
- lib/wordtree/version.rb
|
261
261
|
- spec/fixtures/cassettes/archive_org_download_book.yml
|
262
|
+
- spec/fixtures/library/bo/ok/book/book.1grams.json
|
262
263
|
- spec/fixtures/library/bo/ok/book/book.md
|
263
264
|
- spec/fixtures/library/ot/er/other/other.md
|
264
265
|
- spec/spec_helper.rb
|
@@ -296,6 +297,7 @@ specification_version: 3
|
|
296
297
|
summary: Wordtree common library code
|
297
298
|
test_files:
|
298
299
|
- spec/fixtures/cassettes/archive_org_download_book.yml
|
300
|
+
- spec/fixtures/library/bo/ok/book/book.1grams.json
|
299
301
|
- spec/fixtures/library/bo/ok/book/book.md
|
300
302
|
- spec/fixtures/library/ot/er/other/other.md
|
301
303
|
- spec/spec_helper.rb
|