RubyGems - wordtree - Versions diffs - 0.2.2 → 0.3.0 - Mend

wordtree 0.2.2 → 0.3.0

Files changed (11) hide show

data/lib/wordtree/book.rb +38 -1
data/lib/wordtree/disk/librarian.rb +31 -2
data/lib/wordtree/disk/library.rb +8 -5
data/lib/wordtree/text_utils.rb +31 -0
data/lib/wordtree/version.rb +1 -1
data/spec/fixtures/library/bo/ok/book/book.1grams.json +3 -0
data/spec/wordtree/book_spec.rb +43 -0
data/spec/wordtree/disk/librarian_spec.rb +22 -1
data/spec/wordtree/disk/library_spec.rb +6 -0
data/spec/wordtree/text_utils_spec.rb +30 -4
metadata +3 -1

data/lib/wordtree/book.rb CHANGED Viewed

@@ -21,6 +21,11 @@ module WordTree
     attribute :content, String
+    def initialize(*args)
+      super
+      @ngrams = {}
+    end
     def self.create(id, metadata, content)
       new(metadata.merge("id" => id, "content" => content))
     end
@@ -34,13 +39,45 @@ module WordTree
     end
     def content_clean(wrap=120)
-      TextUtils.clean_text(content, wrap)
+      if @content_clean_wrap != wrap
+        # Memoize content_clean (using last wrap size)
+        @content_clean_wrap = wrap
+        @content_clean = TextUtils.clean_text(content, wrap)
+      end
+      @content_clean
     end
     def content_size
       content ? content.size : nil
     end
+    def each_ngram(n=1, &block)
+      TextUtils.each_ngram(content_clean, n, &block)
+    end
+    def set_ngrams(n, lookup)
+      raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
+      @ngrams[n] = lookup
+    end
+    def ngrams(n=1)
+      # Memoize ngram counts
+      @ngrams[n] ||= count_ngrams(n)
+    end
+    def all_ngrams
+      @ngrams
+    end
+    def count_ngrams(n=1)
+      {}.tap do |tally|
+        each_ngram(n) do |ngram|
+          tally[ngram] ||= 0
+          tally[ngram] += 1
+        end
+      end
+    end
     def calculate_simhash
       content ? content_clean.simhash(:split_by => /\s/) : nil
     end

data/lib/wordtree/disk/librarian.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module WordTree
         end
       end
-      def find(book_id)
+      def find_without_ngrams(book_id)
         begin
           retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
           Book.create(book_id, retrieved.metadata, retrieved.content)
@@ -29,6 +29,20 @@ module WordTree
         end
       end
+      def find(book_id)
+        find_without_ngrams(book_id).tap do |book|
+          (1..9).each do |n|
+            path = library.path_to(book_id, :ngrams, :n => n)
+            if File.exist?(path)
+              File.open(path) do |f|
+                hash = JSON.load(f)
+                book.set_ngrams(n, hash)
+              end
+            end
+          end
+        end
+      end
       def each(file_suffix_re=/\.(md|txt)$/, &block)
         library.each(file_suffix_re) do |path|
           retrieved = Preamble.load(path, :external_encoding => "utf-8")
@@ -36,11 +50,26 @@ module WordTree
         end
       end
-      def save(book)
+      def save_without_ngrams(book)
         library.mkdir(book.id)
         Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
       end
+      def save_ngrams(book)
+        book.all_ngrams.each_pair do |n, hash|
+          path = library.path_to(book.id, :ngrams, :n => n)
+          File.open(path, "w") do |file|
+            file.write hash.to_json
+          end
+        end
+      end
+      def save(book)
+        save_without_ngrams(book).tap do
+          save_ngrams(book)
+        end
+      end
       def archive_org_get(*book_ids, &block)
         book_ids.map do |book_id|
           archive_org_get_with_conditions(identifier: book_id, &block)

data/lib/wordtree/disk/library.rb CHANGED Viewed

@@ -10,7 +10,8 @@ module WordTree
       include Enumerable
       FILE_TYPES = {
-        :raw => "%s.md"
+        :raw => "%{id}.md",
+        :ngrams => "%{id}.%{n}grams.json"
       }
       # The file path to the root of the library directory, e.g. /data/library
@@ -26,13 +27,15 @@ module WordTree
         File.expand_path(LibraryLocator.identity(book_id).relpath, root)
       end
-      def path_to(book_id, type=:raw)
-        File.join(dir_of(book_id), file_type(book_id, type))
+      def path_to(book_id, type=:raw, opts={})
+        File.join(dir_of(book_id), file_type(book_id, type, opts))
       end
-      def file_type(book_id, type=:raw)
+      def file_type(book_id, type=:raw, opts={})
         locator = LibraryLocator.identity(book_id)
-        FILE_TYPES[type] % locator.id
+        template = FILE_TYPES[type]
+        raise ArgumentError, "unable to find file type template #{type.inspect}" if template.nil?
+        template % {:id => locator.id}.merge(opts)
       end
       # Create all subdirs up to the location where a book is stored

data/lib/wordtree/text_utils.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'strscan'
 module WordTree
   module TextUtils
     def self.split_near(text, split_index)
@@ -34,9 +36,12 @@ module WordTree
       _dash = '-'.ord
       _space = ' '.ord
       _newline = "\n".ord
+      _period = '.'.ord
+      _question = '?'.ord
       join_lines = false
       just_added_space = false
+      just_added_period = false
       line_length = 0
       input.each_char do |c|
         c = c.ord
@@ -44,17 +49,28 @@ module WordTree
         c -= 32 if (c >= _A && c <= _Z)
         # Change newlines to spaces
         c = _space if c == _newline
+        # Change question marks to periods (i.e. both count as sentence boundaries)
+        c = _period if c == _question
         if c == _dash
           # In case of a dash, set the scoop-spaces-up flag
           join_lines = true
         elsif join_lines && (c == _space)
           # ignore
+        elsif (c == _period) && !just_added_period
+          if !just_added_space
+            output << _space.chr
+          end
+          output << c.chr
+          just_added_period = true
+          just_added_space = true
         elsif (c >= _a && c <= _z) || (c == _space && !just_added_space)
           # Add letters and spaces
+          output << _space.chr if just_added_period
           output << c.chr
           line_length += 1
           just_added_space = (c == _space)
+          just_added_period = false
           join_lines = false
         end
       end
@@ -69,5 +85,20 @@ module WordTree
       return wrapped_output
     end
+    def self.each_ngram(input, n=1, &block)
+      onegram_re = /([^ \n]+[ \n])/
+      ngram_re = /([^ \n]+[ \n]){#{n},#{n}}/
+      s = StringScanner.new(input)
+      while !s.eos?
+        if words = s.scan(ngram_re)
+          yield words.rstrip.tr("\n", " ") if block_given?
+          # Move back to beginning of n-word sequence
+          s.unscan
+        end
+        # Move forward one word
+        s.scan(onegram_re)
+      end
+    end
   end
 end

data/lib/wordtree/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Wordtree
-  VERSION = "0.2.2"
+  VERSION = "0.3.0"
 end

data/spec/fixtures/library/bo/ok/book/book.1grams.json ADDED Viewed

@@ -0,0 +1,3 @@
+{
+  "xyz": 1
+}

data/spec/wordtree/book_spec.rb CHANGED Viewed

@@ -16,4 +16,47 @@ describe WordTree::Book do
     book = WordTree::Book.create("book", {}, "Wi&ld\nContent!")
     expect(book.content_clean).to eq("wild content\n")
   end
+  context "ngrams" do
+    let(:content) { "A man. A plan. And a man."}
+    let(:book) { WordTree::Book.create("book", {}, content) }
+    let(:one_grams) { { "a" => 3, "man" => 2, "plan" => 1, "and" => 1, "." => 3 } }
+    let(:two_grams) {
+      {"a man" => 2, "man ." => 2, ". a" => 1, "a plan" => 1,
+       "plan ." => 1, ". and" => 1, "and a" => 1}
+    }
+    describe "#count_ngrams" do
+      it "creates a hash lookup table" do
+        hash = book.count_ngrams(1)
+        expect(hash).to be_a(Hash)
+      end
+      it "has counts of ngrams" do
+        hash = book.count_ngrams(1)
+        expect(hash).to eq(one_grams)
+        hash = book.count_ngrams(2)
+        expect(hash).to eq(two_grams)
+      end
+      it "memoizes ngrams" do
+        expect(book).to receive(:count_ngrams).with(1).and_return(one_grams)
+        expect(book.ngrams(1)).to eq one_grams
+        expect(book).to_not receive(:count_ngrams)
+        expect(book.ngrams(1)).to eq one_grams
+      end
+    end
+    describe "#set_ngrams" do
+      it "sets the lookup hash" do
+        book.set_ngrams(1, {"one" => 1})
+        expect(book.ngrams(1)).to eq("one" => 1)
+        expect(book.ngrams(2)).to eq(two_grams)
+      end
+      it "raises an error when not a hash" do
+        expect{ book.set_ngrams(1, "string") }.to raise_error
+        expect{ book.set_ngrams(1, nil) }.to raise_error
+      end
+    end
+  end
 end

data/spec/wordtree/disk/librarian_spec.rb CHANGED Viewed

@@ -32,6 +32,12 @@ describe WordTree::Disk::Librarian do
         expect(book.year).to eq(1800)
         expect(book.content).to eq("Book with content")
       end
+      it "loads ngrams if available" do
+        book = librarian.find("book")
+        expect(book).to_not receive(:count_ngrams)
+        expect(book.ngrams(1)).to eq("xyz" => 1)
+      end
     end
     describe "#each" do
@@ -41,12 +47,27 @@ describe WordTree::Disk::Librarian do
       end
     end
-    it "saves to disk (yaml, content)" do
+    it "saves ngrams to disk" do
       tmp_root = Dir.mktmpdir
       tmp_library = WordTree::Disk::Library.new(tmp_root)
       tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
       book = librarian.find("book")
+      book.ngrams(1)
+      book.ngrams(2)
+      tmp_librarian.save(book)
+      ngrams_filepath = tmp_library.path_to("book", :ngrams, :n => 1)
+      expect(File.exist?(ngrams_filepath)).to be_truthy
+    end
+    it "saves to disk (yaml, content)" do
+      tmp_root = Dir.mktmpdir
+      tmp_library = WordTree::Disk::Library.new(tmp_root)
+      tmp_librarian = WordTree::Disk::Librarian.new(tmp_library)
+      book = librarian.find_without_ngrams("book")
       book.source = "test"
       book.content += "."

data/spec/wordtree/disk/library_spec.rb CHANGED Viewed

@@ -31,4 +31,10 @@ describe WordTree::Disk::Library do
       end
     end
   end
+  describe "#file_type" do
+    it "interpolates n" do
+      expect(library.file_type("abc", :ngrams, :n => 1)).to eq("abc.1grams.json")
+    end
+  end
 end

data/spec/wordtree/text_utils_spec.rb CHANGED Viewed

@@ -33,8 +33,8 @@ describe WordTree::TextUtils do
   end
   context "#clean_text" do
-    let(:sample_text) { "This, [here]  is awesome, right?" }
     it "wraps" do
+      sample_text = "This, [here]  is awesome, right"
       cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
       expect(cleaned).to eq("this here\nis awesome\nright\n")
@@ -45,10 +45,36 @@ describe WordTree::TextUtils do
       expect(cleaned).to eq("this here is awesome right\n")
     end
-    let(:sample_dash) { "What-\never\ndo you\n mean?"}
     it "joins lines ending in -" do
-      cleaned = WordTree::TextUtils.clean_text(sample_dash, 10)
-      expect(cleaned).to eq("whatever\ndo you\nmean\n")
+      sample_text = "What-\never\ndo you\n mean?"
+      cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
+      expect(cleaned).to eq("whatever\ndo you\nmean .\n")
+    end
+    it "does not ignore sentence boundaries" do
+      sample_text = "This is a sentence. And so is this? Keep the dots."
+      cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
+      expect(cleaned).to eq("this is a sentence . and so is this . keep the dots .\n")
+      cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
+      expect(cleaned).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
+    end
+    it "compresses sentence boundary punctuation and spaces" do
+      sample_text = "words . . and.. stuff"
+      cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
+      expect(cleaned).to eq("words . and . stuff\n")
+    end
+  end
+  context "#each_ngram" do
+    it "yields ngrams in succession" do
+      sample_text = "one word\n. two\n"
+      expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
+        yield_successive_args("one", "word", ".", "two")
+      expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 2, &b) }.to \
+        yield_successive_args("one word", "word .", ". two")
+      expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 3, &b) }.to \
+        yield_successive_args("one word .", "word . two")
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wordtree
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
   prerelease:
 platform: ruby
 authors:
@@ -259,6 +259,7 @@ files:
 - lib/wordtree/text_utils.rb
 - lib/wordtree/version.rb
 - spec/fixtures/cassettes/archive_org_download_book.yml
+- spec/fixtures/library/bo/ok/book/book.1grams.json
 - spec/fixtures/library/bo/ok/book/book.md
 - spec/fixtures/library/ot/er/other/other.md
 - spec/spec_helper.rb
@@ -296,6 +297,7 @@ specification_version: 3
 summary: Wordtree common library code
 test_files:
 - spec/fixtures/cassettes/archive_org_download_book.yml
+- spec/fixtures/library/bo/ok/book/book.1grams.json
 - spec/fixtures/library/bo/ok/book/book.md
 - spec/fixtures/library/ot/er/other/other.md
 - spec/spec_helper.rb