wordtree 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wordtree/book.rb CHANGED
@@ -21,6 +21,11 @@ module WordTree
21
21
 
22
22
  attribute :content, String
23
23
 
24
# Forward all constructor arguments to the superclass, then start with an
# empty store of n-gram lookup tables (filled in by set_ngrams / ngrams).
def initialize(*args)
  super(*args)
  @ngrams = Hash.new
end
28
+
24
29
  def self.create(id, metadata, content)
25
30
  new(metadata.merge("id" => id, "content" => content))
26
31
  end
@@ -34,13 +39,45 @@ module WordTree
34
39
  end
35
40
 
36
41
  def content_clean(wrap=120)
37
- TextUtils.clean_text(content, wrap)
42
+ if @content_clean_wrap != wrap
43
+ # Memoize content_clean (using last wrap size)
44
+ @content_clean_wrap = wrap
45
+ @content_clean = TextUtils.clean_text(content, wrap)
46
+ end
47
+ @content_clean
38
48
  end
39
49
 
40
50
  def content_size
41
51
  content ? content.size : nil
42
52
  end
43
53
 
54
# Yield each n-gram (a run of +n+ consecutive words) of the cleaned content
# to the block, by delegating to TextUtils.each_ngram.
#
# A book without content yields nothing and returns nil, matching how
# content_size and calculate_simhash treat missing content, instead of
# failing while trying to clean nil.
def each_ngram(n=1, &block)
  return unless content
  TextUtils.each_ngram(content_clean, n, &block)
end
57
+
58
# Store +lookup+ as the n-gram table for +n+, replacing any table already
# held for that n. Raises ArgumentError unless +lookup+ is a Hash.
def set_ngrams(n, lookup)
  unless lookup.is_a?(Hash)
    raise ArgumentError, "must be a Hash"
  end

  @ngrams[n] = lookup
end
62
+
63
# Return the n-gram count table for +n+. A table already held for that n
# (computed earlier or supplied through set_ngrams) is returned as-is;
# otherwise the counts are computed with count_ngrams and remembered.
def ngrams(n=1)
  cached = @ngrams[n]
  return cached if cached

  @ngrams[n] = count_ngrams(n)
end
67
+
68
# Every n-gram table currently held, as a hash keyed by n.
# This is the internal store itself, not a copy.
def all_ngrams
  @ngrams
end
71
+
72
# Count how often each n-gram is yielded by each_ngram(n) and return the
# counts as a plain hash of n-gram => occurrences. Nothing is cached here;
# every call walks the n-grams again.
def count_ngrams(n=1)
  counts = {}
  each_ngram(n) do |gram|
    counts[gram] = counts.fetch(gram, 0) + 1
  end
  counts
end
80
+
44
81
  def calculate_simhash
45
82
  content ? content_clean.simhash(:split_by => /\s/) : nil
46
83
  end
@@ -20,7 +20,7 @@ module WordTree
20
20
  end
21
21
  end
22
22
 
23
- def find(book_id)
23
+ def find_without_ngrams(book_id)
24
24
  begin
25
25
  retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
26
26
  Book.create(book_id, retrieved.metadata, retrieved.content)
@@ -29,6 +29,20 @@ module WordTree
29
29
  end
30
30
  end
31
31
 
32
# Load a book via find_without_ngrams and attach any n-gram tables that
# have been saved alongside it.
#
# For each n from 1 through 9, if the library's :ngrams file for that n
# exists, its JSON is loaded and handed to the book with set_ngrams.
# Returns whatever find_without_ngrams returned.
def find(book_id)
  book = find_without_ngrams(book_id)
  1.upto(9) do |n|
    ngrams_path = library.path_to(book_id, :ngrams, :n => n)
    next unless File.exist?(ngrams_path)

    File.open(ngrams_path) do |io|
      book.set_ngrams(n, JSON.load(io))
    end
  end
  book
end
45
+
32
46
  def each(file_suffix_re=/\.(md|txt)$/, &block)
33
47
  library.each(file_suffix_re) do |path|
34
48
  retrieved = Preamble.load(path, :external_encoding => "utf-8")
@@ -36,11 +50,26 @@ module WordTree
36
50
  end
37
51
  end
38
52
 
39
- def save(book)
53
# Write the book's metadata and content to its raw file in the library,
# after asking the library to create the book's directory. Missing (nil)
# content is written as an empty string. Returns the result of
# Preamble#save.
def save_without_ngrams(book)
  library.mkdir(book.id)
  document = Preamble.new(book.metadata, book.content || "")
  document.save(library.path_to(book.id))
end
43
57
 
58
# Write each n-gram table held by the book to the library's :ngrams file
# for that n, serialized as JSON. Existing files are overwritten.
# Returns the book's hash of tables.
def save_ngrams(book)
  book.all_ngrams.each do |n, table|
    target = library.path_to(book.id, :ngrams, :n => n)
    File.write(target, table.to_json)
  end
end
66
+
67
# Save the book's metadata and content first, then its n-gram tables.
# Returns the result of save_without_ngrams.
def save(book)
  result = save_without_ngrams(book)
  save_ngrams(book)
  result
end
72
+
44
73
  def archive_org_get(*book_ids, &block)
45
74
  book_ids.map do |book_id|
46
75
  archive_org_get_with_conditions(identifier: book_id, &block)
@@ -10,7 +10,8 @@ module WordTree
10
10
  include Enumerable
11
11
 
12
12
  FILE_TYPES = {
13
- :raw => "%s.md"
13
+ :raw => "%{id}.md",
14
+ :ngrams => "%{id}.%{n}grams.json"
14
15
  }
15
16
 
16
17
  # The file path to the root of the library directory, e.g. /data/library
@@ -26,13 +27,15 @@ module WordTree
26
27
  File.expand_path(LibraryLocator.identity(book_id).relpath, root)
27
28
  end
28
29
 
29
- def path_to(book_id, type=:raw)
30
- File.join(dir_of(book_id), file_type(book_id, type))
30
# Path to one of a book's files: the book's directory (dir_of) joined with
# the filename that file_type builds for +type+. +opts+ is passed through
# to file_type to fill in extra template values (e.g. :n for :ngrams).
def path_to(book_id, type=:raw, opts={})
  directory = dir_of(book_id)
  filename = file_type(book_id, type, opts)
  File.join(directory, filename)
end
32
33
 
33
- def file_type(book_id, type=:raw)
34
# Build the filename for a book's file of the given +type+ by filling in
# the matching FILE_TYPES template. The template receives :id (taken from
# the book's LibraryLocator) merged with +opts+, so +opts+ supplies any
# further values the template names, such as :n for :ngrams.
#
# Raises ArgumentError when +type+ has no template.
def file_type(book_id, type=:raw, opts={})
  book_locator = LibraryLocator.identity(book_id)
  pattern = FILE_TYPES[type]
  if pattern.nil?
    raise ArgumentError, "unable to find file type template #{type.inspect}"
  end

  values = {:id => book_locator.id}.merge(opts)
  pattern % values
end
37
40
 
38
41
  # Create all subdirs up to the location where a book is stored
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module WordTree
2
4
  module TextUtils
3
5
  def self.split_near(text, split_index)
@@ -34,9 +36,12 @@ module WordTree
34
36
  _dash = '-'.ord
35
37
  _space = ' '.ord
36
38
  _newline = "\n".ord
39
+ _period = '.'.ord
40
+ _question = '?'.ord
37
41
 
38
42
  join_lines = false
39
43
  just_added_space = false
44
+ just_added_period = false
40
45
  line_length = 0
41
46
  input.each_char do |c|
42
47
  c = c.ord
@@ -44,17 +49,28 @@ module WordTree
44
49
  c -= 32 if (c >= _A && c <= _Z)
45
50
  # Change newlines to spaces
46
51
  c = _space if c == _newline
52
+ # Change question marks to periods (i.e. both count as sentence boundaries)
53
+ c = _period if c == _question
47
54
 
48
55
  if c == _dash
49
56
  # In case of a dash, set the scoop-spaces-up flag
50
57
  join_lines = true
51
58
  elsif join_lines && (c == _space)
52
59
  # ignore
60
+ elsif (c == _period) && !just_added_period
61
+ if !just_added_space
62
+ output << _space.chr
63
+ end
64
+ output << c.chr
65
+ just_added_period = true
66
+ just_added_space = true
53
67
  elsif (c >= _a && c <= _z) || (c == _space && !just_added_space)
54
68
  # Add letters and spaces
69
+ output << _space.chr if just_added_period
55
70
  output << c.chr
56
71
  line_length += 1
57
72
  just_added_space = (c == _space)
73
+ just_added_period = false
58
74
  join_lines = false
59
75
  end
60
76
  end
@@ -69,5 +85,20 @@ module WordTree
69
85
 
70
86
  return wrapped_output
71
87
  end
88
+
89
# Yield every run of +n+ consecutive words in +input+, in order of
# appearance, each as a single string with its words joined by one space.
#
# A word is a maximal run of characters other than space and newline.
# Input with fewer than +n+ words yields nothing.
#
# Returns nil when a block is given, or an Enumerator over the n-grams
# when called without one.
def self.each_ngram(input, n=1, &block)
  return enum_for(:each_ngram, input, n) unless block_given?

  # Slide a window over the last n words. Scanning word by word always
  # advances, so leading or repeated separators and a final word with no
  # trailing separator cannot stall the loop.
  window = []
  input.scan(/[^ \n]+/) do |word|
    window << word
    window.shift if window.size > n
    # rstrip drops other trailing whitespace left on the last word
    # (e.g. the "\r" of a "\r\n" line ending).
    yield window.join(" ").rstrip if window.size == n
  end
  nil
end
72
103
  end
73
104
  end
@@ -1,3 +1,3 @@
1
1
  module Wordtree
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -0,0 +1,3 @@
1
+ {
2
+ "xyz": 1
3
+ }
@@ -16,4 +16,47 @@ describe WordTree::Book do
16
16
  book = WordTree::Book.create("book", {}, "Wi&ld\nContent!")
17
17
  expect(book.content_clean).to eq("wild content\n")
18
18
  end
19
+
20
# Specs for Book's n-gram counting, memoization and preloaded tables.
context "ngrams" do
  let(:content) { "A man. A plan. And a man." }
  let(:book) { WordTree::Book.create("book", {}, content) }
  let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
  let(:two_grams) {
    {"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
     "plan ." => 1, ". and" => 1, "and a" => 1}
  }

  describe "#count_ngrams" do
    it "creates a hash lookup table" do
      hash = book.count_ngrams(1)
      expect(hash).to be_a(Hash)
    end

    it "has counts of ngrams" do
      hash = book.count_ngrams(1)
      expect(hash).to eq(one_grams)
      hash = book.count_ngrams(2)
      expect(hash).to eq(two_grams)
    end

    it "memoizes ngrams" do
      expect(book).to receive(:count_ngrams).with(1).and_return(one_grams)
      expect(book.ngrams(1)).to eq one_grams
      expect(book).to_not receive(:count_ngrams)
      expect(book.ngrams(1)).to eq one_grams
    end
  end

  describe "#set_ngrams" do
    it "sets the lookup hash" do
      book.set_ngrams(1, {"one" => 1})
      expect(book.ngrams(1)).to eq("one" => 1)
      expect(book.ngrams(2)).to eq(two_grams)
    end

    it "raises an error when not a hash" do
      # Name the expected class: a bare raise_error would also pass on an
      # unrelated failure such as NoMethodError.
      expect{ book.set_ngrams(1, "string") }.to raise_error(ArgumentError)
      expect{ book.set_ngrams(1, nil) }.to raise_error(ArgumentError)
    end
  end
end
19
62
  end
@@ -32,6 +32,12 @@ describe WordTree::Disk::Librarian do
32
32
  expect(book.year).to eq(1800)
33
33
  expect(book.content).to eq("Book with content")
34
34
  end
35
+
36
+ it "loads ngrams if available" do
37
+ book = librarian.find("book")
38
+ expect(book).to_not receive(:count_ngrams)
39
+ expect(book.ngrams(1)).to eq("xyz" => 1)
40
+ end
35
41
  end
36
42
 
37
43
  describe "#each" do
@@ -41,12 +47,27 @@ describe WordTree::Disk::Librarian do
41
47
  end
42
48
  end
43
49
 
44
- it "saves to disk (yaml, content)" do
50
+ it "saves ngrams to disk" do
45
51
  tmp_root = Dir.mktmpdir
46
52
  tmp_library = WordTree::Disk::Library.new(tmp_root)
47
53
  tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
48
54
 
49
55
  book = librarian.find("book")
56
+ book.ngrams(1)
57
+ book.ngrams(2)
58
+
59
+ tmp_librarian.save(book)
60
+
61
+ ngrams_filepath = tmp_library.path_to("book", :ngrams, :n => 1)
62
+ expect(File.exist?(ngrams_filepath)).to be_truthy
63
+ end
64
+
65
+ it "saves to disk (yaml, content)" do
66
+ tmp_root = Dir.mktmpdir
67
+ tmp_library = WordTree::Disk::Library.new(tmp_root)
68
+ tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
69
+
70
+ book = librarian.find_without_ngrams("book")
50
71
 
51
72
  book.source = "test"
52
73
  book.content += "."
@@ -31,4 +31,10 @@ describe WordTree::Disk::Library do
31
31
  end
32
32
  end
33
33
  end
34
+
35
+ describe "#file_type" do
36
+ it "interpolates n" do
37
+ expect(library.file_type("abc", :ngrams, :n => 1)).to eq("abc.1grams.json")
38
+ end
39
+ end
34
40
  end
@@ -33,8 +33,8 @@ describe WordTree::TextUtils do
33
33
  end
34
34
 
35
35
  context "#clean_text" do
36
- let(:sample_text) { "This, [here] is awesome, right?" }
37
36
  it "wraps" do
37
+ sample_text = "This, [here] is awesome, right"
38
38
  cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
39
39
  expect(cleaned).to eq("this here\nis awesome\nright\n")
40
40
 
@@ -45,10 +45,36 @@ describe WordTree::TextUtils do
45
45
  expect(cleaned).to eq("this here is awesome right\n")
46
46
  end
47
47
 
48
- let(:sample_dash) { "What-\never\ndo you\n mean?"}
49
48
  it "joins lines ending in -" do
50
- cleaned = WordTree::TextUtils.clean_text(sample_dash, 10)
51
- expect(cleaned).to eq("whatever\ndo you\nmean\n")
49
+ sample_text = "What-\never\ndo you\n mean?"
50
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
51
+ expect(cleaned).to eq("whatever\ndo you\nmean .\n")
52
+ end
53
+
54
+ it "does not ignore sentence boundaries" do
55
+ sample_text = "This is a sentence. And so is this? Keep the dots."
56
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
57
+ expect(cleaned).to eq("this is a sentence . and so is this . keep the dots .\n")
58
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
59
+ expect(cleaned).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
60
+ end
61
+
62
+ it "compresses sentence boundary punctuation and spaces" do
63
+ sample_text = "words . . and.. stuff"
64
+ cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
65
+ expect(cleaned).to eq("words . and . stuff\n")
66
+ end
67
+ end
68
+
69
+ context "#each_ngram" do
70
+ it "yields ngrams in succession" do
71
+ sample_text = "one word\n. two\n"
72
+ expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
73
+ yield_successive_args("one", "word", ".", "two")
74
+ expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 2, &b) }.to \
75
+ yield_successive_args("one word", "word .", ". two")
76
+ expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 3, &b) }.to \
77
+ yield_successive_args("one word .", "word . two")
52
78
  end
53
79
  end
54
80
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -259,6 +259,7 @@ files:
259
259
  - lib/wordtree/text_utils.rb
260
260
  - lib/wordtree/version.rb
261
261
  - spec/fixtures/cassettes/archive_org_download_book.yml
262
+ - spec/fixtures/library/bo/ok/book/book.1grams.json
262
263
  - spec/fixtures/library/bo/ok/book/book.md
263
264
  - spec/fixtures/library/ot/er/other/other.md
264
265
  - spec/spec_helper.rb
@@ -296,6 +297,7 @@ specification_version: 3
296
297
  summary: Wordtree common library code
297
298
  test_files:
298
299
  - spec/fixtures/cassettes/archive_org_download_book.yml
300
+ - spec/fixtures/library/bo/ok/book/book.1grams.json
299
301
  - spec/fixtures/library/bo/ok/book/book.md
300
302
  - spec/fixtures/library/ot/er/other/other.md
301
303
  - spec/spec_helper.rb