wordtree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wordtree/book.rb CHANGED
@@ -21,6 +21,11 @@ module WordTree
21
21
 
22
22
  attribute :content, String
23
23
 
24
# Builds the book through the superclass initializer (the same arguments are
# forwarded implicitly by the bare +super+), then starts the book off with an
# empty cache of n-gram lookup tables keyed by n-gram size.
def initialize(*args)
  super
  @ngrams = Hash.new
end
28
+
24
29
  def self.create(id, metadata, content)
25
30
  new(metadata.merge("id" => id, "content" => content))
26
31
  end
@@ -34,13 +39,45 @@ module WordTree
34
39
  end
35
40
 
36
41
  def content_clean(wrap=120)
37
- TextUtils.clean_text(content, wrap)
42
+ if @content_clean_wrap != wrap
43
+ # Memoize content_clean (using last wrap size)
44
+ @content_clean_wrap = wrap
45
+ @content_clean = TextUtils.clean_text(content, wrap)
46
+ end
47
+ @content_clean
38
48
  end
39
49
 
40
50
  def content_size
41
51
  content ? content.size : nil
42
52
  end
43
53
 
54
# Iterates over the n-grams of this book's cleaned content (content_clean with
# its default wrap), handing each one to the given block. The iteration itself
# is delegated to TextUtils.each_ngram.
def each_ngram(n=1, &block)
  cleaned = content_clean
  TextUtils.each_ngram(cleaned, n, &block)
end
57
+
58
# Installs a precomputed n-gram lookup table for size +n+, replacing whatever
# table is currently cached for that size.
#
# Raises ArgumentError unless +lookup+ is a Hash. Returns +lookup+.
def set_ngrams(n, lookup)
  unless lookup.is_a?(Hash)
    raise ArgumentError, "must be a Hash"
  end

  @ngrams[n] = lookup
end
62
+
63
# Returns the n-gram count table for size +n+.
#
# A table already present in the cache (computed earlier, or installed through
# set_ngrams) is returned as is; otherwise the counts are computed with
# count_ngrams, stored in the cache and returned.
def ngrams(n=1)
  cached = @ngrams[n]
  return cached if cached

  @ngrams[n] = count_ngrams(n)
end
67
+
68
# Returns the whole n-gram cache: a Hash mapping each n-gram size that has been
# computed or set so far to its lookup table. This is the live internal Hash,
# not a copy, so changes made by the caller are visible to the book.
def all_ngrams
  @ngrams
end
71
+
72
# Counts how often each n-gram of size +n+ occurs, by walking each_ngram.
#
# Returns a fresh Hash mapping n-gram => number of occurrences. Nothing is
# cached here; see #ngrams for the memoized variant.
def count_ngrams(n=1)
  counts = {}
  each_ngram(n) do |gram|
    counts[gram] = counts.fetch(gram, 0) + 1
  end
  counts
end
80
+
44
81
  def calculate_simhash
45
82
  content ? content_clean.simhash(:split_by => /\s/) : nil
46
83
  end
@@ -20,7 +20,7 @@ module WordTree
20
20
  end
21
21
  end
22
22
 
23
- def find(book_id)
23
+ def find_without_ngrams(book_id)
24
24
  begin
25
25
  retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
26
26
  Book.create(book_id, retrieved.metadata, retrieved.content)
@@ -29,6 +29,20 @@ module WordTree
29
29
  end
30
30
  end
31
31
 
32
# Loads a book via find_without_ngrams and attaches any n-gram tables that
# have been saved next to it (sizes 1 through 9), so that Book#ngrams can
# serve them without recounting.
#
# Returns whatever find_without_ngrams returned.
def find(book_id)
  book = find_without_ngrams(book_id)
  # NOTE(review): the rescue branch of find_without_ngrams is not visible
  # here; this guard assumes it may yield nil for a missing book - confirm.
  return book if book.nil?

  (1..9).each do |n|
    path = library.path_to(book_id, :ngrams, :n => n)
    next unless File.exist?(path)

    # JSON.parse rather than JSON.load: the cache file only needs plain data,
    # and JSON.load is documented as intended for trusted sources only.
    book.set_ngrams(n, JSON.parse(File.read(path, :encoding => "utf-8")))
  end
  book
end
45
+
32
46
  def each(file_suffix_re=/\.(md|txt)$/, &block)
33
47
  library.each(file_suffix_re) do |path|
34
48
  retrieved = Preamble.load(path, :external_encoding => "utf-8")
@@ -36,11 +50,26 @@ module WordTree
36
50
  end
37
51
  end
38
52
 
39
- def save(book)
53
# Writes the book's metadata and content to its raw file in the library,
# creating the book's directory first. A book without content is saved with an
# empty body. N-gram tables are not written; see save_ngrams.
def save_without_ngrams(book)
  library.mkdir(book.id)
  document = Preamble.new(book.metadata, book.content || "")
  destination = library.path_to(book.id)
  document.save(destination)
end
43
57
 
58
# Writes every n-gram table held by +book+ (see Book#all_ngrams) to its own
# JSON file in the library, one file per n-gram size.
#
# The book's directory is created first when there is anything to write, so
# this method also works when called on its own rather than through #save.
# NOTE(review): assumes library.mkdir tolerates an existing directory, as
# save_without_ngrams already calls it on every save - confirm.
#
# Returns the book's n-gram cache.
def save_ngrams(book)
  tables = book.all_ngrams
  library.mkdir(book.id) unless tables.empty?

  tables.each_pair do |n, hash|
    File.write(library.path_to(book.id, :ngrams, :n => n), hash.to_json)
  end
end
66
+
67
# Saves the book's metadata and content, then its n-gram tables.
#
# Returns the result of save_without_ngrams; the return value of save_ngrams
# is discarded.
def save(book)
  outcome = save_without_ngrams(book)
  save_ngrams(book)
  outcome
end
72
+
44
73
  def archive_org_get(*book_ids, &block)
45
74
  book_ids.map do |book_id|
46
75
  archive_org_get_with_conditions(identifier: book_id, &block)
@@ -10,7 +10,8 @@ module WordTree
10
10
  include Enumerable
11
11
 
12
12
  FILE_TYPES = {
13
- :raw => "%s.md"
13
+ :raw => "%{id}.md",
14
+ :ngrams => "%{id}.%{n}grams.json"
14
15
  }
15
16
 
16
17
  # The file path to the root of the library directory, e.g. /data/library
@@ -26,13 +27,15 @@ module WordTree
26
27
  File.expand_path(LibraryLocator.identity(book_id).relpath, root)
27
28
  end
28
29
 
29
- def path_to(book_id, type=:raw)
30
- File.join(dir_of(book_id), file_type(book_id, type))
30
# Full path of one of a book's files: the book's directory (dir_of) joined
# with the file name produced by file_type for the given +type+ and +opts+.
def path_to(book_id, type=:raw, opts={})
  directory = dir_of(book_id)
  filename = file_type(book_id, type, opts)
  File.join(directory, filename)
end
32
33
 
33
- def file_type(book_id, type=:raw)
34
# File name (without directory) of a book's file of the given +type+.
#
# The name is built from the FILE_TYPES template for +type+, filled in with
# the id from the book's LibraryLocator plus any extra values in +opts+
# (e.g. :n for n-gram files). Entries in +opts+ take precedence over :id.
#
# Raises ArgumentError when +type+ has no template.
def file_type(book_id, type=:raw, opts={})
  locator = LibraryLocator.identity(book_id)
  template = FILE_TYPES[type]
  if template.nil?
    raise ArgumentError, "unable to find file type template #{type.inspect}"
  end

  values = { :id => locator.id }.merge(opts)
  format(template, values)
end
37
40
 
38
41
  # Create all subdirs up to the location where a book is stored
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module WordTree
2
4
  module TextUtils
3
5
  def self.split_near(text, split_index)
@@ -34,9 +36,12 @@ module WordTree
34
36
  _dash = '-'.ord
35
37
  _space = ' '.ord
36
38
  _newline = "\n".ord
39
+ _period = '.'.ord
40
+ _question = '?'.ord
37
41
 
38
42
  join_lines = false
39
43
  just_added_space = false
44
+ just_added_period = false
40
45
  line_length = 0
41
46
  input.each_char do |c|
42
47
  c = c.ord
@@ -44,17 +49,28 @@ module WordTree
44
49
  c -= 32 if (c >= _A && c <= _Z)
45
50
  # Change newlines to spaces
46
51
  c = _space if c == _newline
52
+ # Change question marks to periods (i.e. both count as sentence boundaries)
53
+ c = _period if c == _question
47
54
 
48
55
  if c == _dash
49
56
  # In case of a dash, set the scoop-spaces-up flag
50
57
  join_lines = true
51
58
  elsif join_lines && (c == _space)
52
59
  # ignore
60
+ elsif (c == _period) && !just_added_period
61
+ if !just_added_space
62
+ output << _space.chr
63
+ end
64
+ output << c.chr
65
+ just_added_period = true
66
+ just_added_space = true
53
67
  elsif (c >= _a && c <= _z) || (c == _space && !just_added_space)
54
68
  # Add letters and spaces
69
+ output << _space.chr if just_added_period
55
70
  output << c.chr
56
71
  line_length += 1
57
72
  just_added_space = (c == _space)
73
+ just_added_period = false
58
74
  join_lines = false
59
75
  end
60
76
  end
@@ -69,5 +85,20 @@ module WordTree
69
85
 
70
86
  return wrapped_output
71
87
  end
88
+
89
# Yields every run of +n+ consecutive words in +input+, in order of
# appearance, each as a single String with the words joined by one space.
#
# A word is a maximal run of characters other than space and newline. The
# input need not end with a separator, and leading or repeated separators are
# skipped; the previous scanner-based loop never terminated on such input
# because it could not advance past them.
#
# input - the String to read words from
# n     - number of words per n-gram; must be 1 or greater
#
# Returns nil when a block is given, otherwise an Enumerator over the n-grams.
# Raises ArgumentError if +n+ is less than 1.
def self.each_ngram(input, n=1, &block)
  size = Integer(n)
  raise ArgumentError, "n must be a positive integer" if size < 1
  return enum_for(:each_ngram, input, size) unless block

  input.scan(/[^ \n]+/).each_cons(size) do |words|
    # rstrip keeps the earlier behaviour of trimming trailing whitespace
    # (e.g. a stray "\r") from the end of the n-gram.
    block.call(words.join(" ").rstrip)
  end
  nil
end
72
103
  end
73
104
  end
@@ -1,3 +1,3 @@
1
1
  module Wordtree
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -0,0 +1,3 @@
1
+ {
2
+ "xyz": 1
3
+ }
@@ -16,4 +16,47 @@ describe WordTree::Book do
16
16
  book = WordTree::Book.create("book", {}, "Wi&ld\nContent!")
17
17
  expect(book.content_clean).to eq("wild content\n")
18
18
  end
19
+
20
+ context "ngrams" do
21
+ let(:content) { "A man. A plan. And a man."}
22
+ let(:book) { WordTree::Book.create("book", {}, content) }
23
+ let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
24
+ let(:two_grams) {
25
+ {"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
26
+ "plan ." => 1, ". and" => 1, "and a" => 1}
27
+ }
28
+ describe "#count_ngrams" do
29
+ it "creates a hash lookup table" do
30
+ hash = book.count_ngrams(1)
31
+ expect(hash).to be_a(Hash)
32
+ end
33
+
34
+ it "has counts of ngrams" do
35
+ hash = book.count_ngrams(1)
36
+ expect(hash).to eq(one_grams)
37
+ hash = book.count_ngrams(2)
38
+ expect(hash).to eq(two_grams)
39
+ end
40
+
41
+ it "memoizes ngrams" do
42
+ expect(book).to receive(:count_ngrams).with(1).and_return(one_grams)
43
+ expect(book.ngrams(1)).to eq one_grams
44
+ expect(book).to_not receive(:count_ngrams)
45
+ expect(book.ngrams(1)).to eq one_grams
46
+ end
47
+ end
48
+
49
+ describe "#set_ngrams" do
50
+ it "sets the lookup hash" do
51
+ book.set_ngrams(1, {"one" => 1})
52
+ expect(book.ngrams(1)).to eq("one" => 1)
53
+ expect(book.ngrams(2)).to eq(two_grams)
54
+ end
55
+
56
it "raises an error when not a hash" do
  # Pin the error class and message: a bare raise_error also passes on any
  # unrelated exception (e.g. a NoMethodError), hiding real failures.
  expect { book.set_ngrams(1, "string") }.to raise_error(ArgumentError, "must be a Hash")
  expect { book.set_ngrams(1, nil) }.to raise_error(ArgumentError, "must be a Hash")
end
60
+ end
61
+ end
19
62
  end
@@ -32,6 +32,12 @@ describe WordTree::Disk::Librarian do
32
32
  expect(book.year).to eq(1800)
33
33
  expect(book.content).to eq("Book with content")
34
34
  end
35
+
36
+ it "loads ngrams if available" do
37
+ book = librarian.find("book")
38
+ expect(book).to_not receive(:count_ngrams)
39
+ expect(book.ngrams(1)).to eq("xyz" => 1)
40
+ end
35
41
  end
36
42
 
37
43
  describe "#each" do
@@ -41,12 +47,27 @@ describe WordTree::Disk::Librarian do
41
47
  end
42
48
  end
43
49
 
44
- it "saves to disk (yaml, content)" do
50
+ it "saves ngrams to disk" do
45
51
  tmp_root = Dir.mktmpdir
46
52
  tmp_library = WordTree::Disk::Library.new(tmp_root)
47
53
  tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
48
54
 
49
55
  book = librarian.find("book")
56
+ book.ngrams(1)
57
+ book.ngrams(2)
58
+
59
+ tmp_librarian.save(book)
60
+
61
+ ngrams_filepath = tmp_library.path_to("book", :ngrams, :n => 1)
62
+ expect(File.exist?(ngrams_filepath)).to be_truthy
63
+ end
64
+
65
+ it "saves to disk (yaml, content)" do
66
+ tmp_root = Dir.mktmpdir
67
+ tmp_library = WordTree::Disk::Library.new(tmp_root)
68
+ tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
69
+
70
+ book = librarian.find_without_ngrams("book")
50
71
 
51
72
  book.source = "test"
52
73
  book.content += "."
@@ -31,4 +31,10 @@ describe WordTree::Disk::Library do
31
31
  end
32
32
  end
33
33
  end
34
+
35
+ describe "#file_type" do
36
+ it "interpolates n" do
37
+ expect(library.file_type("abc", :ngrams, :n => 1)).to eq("abc.1grams.json")
38
+ end
39
+ end
34
40
  end
@@ -33,8 +33,8 @@ describe WordTree::TextUtils do
33
33
  end
34
34
 
35
35
  context "#clean_text" do
36
- let(:sample_text) { "This, [here] is awesome, right?" }
37
36
  it "wraps" do
37
+ sample_text = "This, [here] is awesome, right"
38
38
  cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
39
39
  expect(cleaned).to eq("this here\nis awesome\nright\n")
40
40
 
@@ -45,10 +45,36 @@ describe WordTree::TextUtils do
45
45
  expect(cleaned).to eq("this here is awesome right\n")
46
46
  end
47
47
 
48
- let(:sample_dash) { "What-\never\ndo you\n mean?"}
49
48
  it "joins lines ending in -" do
50
- cleaned = WordTree::TextUtils.clean_text(sample_dash, 10)
51
- expect(cleaned).to eq("whatever\ndo you\nmean\n")
49
+ sample_text = "What-\never\ndo you\n mean?"
50
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
51
+ expect(cleaned).to eq("whatever\ndo you\nmean .\n")
52
+ end
53
+
54
+ it "does not ignore sentence boundaries" do
55
+ sample_text = "This is a sentence. And so is this? Keep the dots."
56
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
57
+ expect(cleaned).to eq("this is a sentence . and so is this . keep the dots .\n")
58
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
59
+ expect(cleaned).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
60
+ end
61
+
62
+ it "compresses sentence boundary punctuation and spaces" do
63
+ sample_text = "words . . and.. stuff"
64
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
65
+ expect(cleaned).to eq("words . and . stuff\n")
66
+ end
67
+ end
68
+
69
+ context "#each_ngram" do
70
+ it "yields ngrams in succession" do
71
+ sample_text = "one word\n. two\n"
72
+ expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
73
+ yield_successive_args("one", "word", ".", "two")
74
+ expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 2, &b) }.to \
75
+ yield_successive_args("one word", "word .", ". two")
76
+ expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 3, &b) }.to \
77
+ yield_successive_args("one word .", "word . two")
52
78
  end
53
79
  end
54
80
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -259,6 +259,7 @@ files:
259
259
  - lib/wordtree/text_utils.rb
260
260
  - lib/wordtree/version.rb
261
261
  - spec/fixtures/cassettes/archive_org_download_book.yml
262
+ - spec/fixtures/library/bo/ok/book/book.1grams.json
262
263
  - spec/fixtures/library/bo/ok/book/book.md
263
264
  - spec/fixtures/library/ot/er/other/other.md
264
265
  - spec/spec_helper.rb
@@ -296,6 +297,7 @@ specification_version: 3
296
297
  summary: Wordtree common library code
297
298
  test_files:
298
299
  - spec/fixtures/cassettes/archive_org_download_book.yml
300
+ - spec/fixtures/library/bo/ok/book/book.1grams.json
299
301
  - spec/fixtures/library/bo/ok/book/book.md
300
302
  - spec/fixtures/library/ot/er/other/other.md
301
303
  - spec/spec_helper.rb