RubyGems - greeb - Versions diffs - 0.2.0.pre3 → 0.2.0.rc1 - Mend

greeb 0.2.0.pre3 → 0.2.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 618591e00b61f1df11f98bdd045bd650d34ba863
-  data.tar.gz: 88d1b8448e98c18e6d9759e4d992d2fbea7c1d63
+  metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
+  data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
 SHA512:
-  metadata.gz: e8113e47988e80aabfc07314268a5f8220cce88edbf06bd69b35602623c0a310c3c460e300143943596decae621ee69b4909371b9f43a7d9225bceb336bf21f6
-  data.tar.gz: 7ebe3c3e0a603bf1fc0072376c3b2b544b43ae38e31e8bc5ff9e34fcaf362b8c474ba565db67363c09f10a2f4960fdb0bf7a165ee6c0b90d657b3914231cc07a
+  metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
+  data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973

data/bin/greeb CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 if File.exists? File.expand_path('../../.git', __FILE__)
-  $:.unshift File.expand_path('../../lib', __FILE__)
+  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
 end
 require 'greeb'

data/lib/greeb/segmentator.rb CHANGED Viewed

@@ -7,7 +7,7 @@ class Greeb::Segmentator
   # Sentence does not start from the separator character, line break
   # character, and punctuation characters.
   #
-  SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
+  SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
   attr_reader :tokens
@@ -24,8 +24,7 @@ class Greeb::Segmentator
   # @return [Array<Greeb::Entity>] a set of sentences.
   #
   def sentences
-    detect_sentences! unless @sentences
-    @sentences
+    @sentences ||= detect_entities(new_sentence, [:punct])
   end
   # Subsentences memoization method.
@@ -33,8 +32,7 @@ class Greeb::Segmentator
   # @return [Array<Greeb::Entity>] a set of subsentences.
   #
   def subsentences
-    detect_subsentences! unless @subsentences
-    @subsentences
+    @subsentences ||= detect_entities(new_subsentence, [:punct, :spunct])
   end
   # Extract tokens from the set of sentences.
@@ -44,99 +42,75 @@ class Greeb::Segmentator
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
   #   sentences as keys and tokens arrays as values.
   #
-  def extract(sentences)
+  def extract(sentences, collection = tokens)
     Hash[
       sentences.map do |s|
-        [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
-      end
-    ]
-  end
-  # Extract subsentences from the set of sentences.
-  #
-  # @param sentences [Array<Greeb::Entity>] a list of sentences.
-  #
-  # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-  #   sentences as keys and subsentences arrays as values.
-  #
-  def subextract(sentences)
-    Hash[
-      sentences.map do |s|
-        [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
+        [s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
       end
     ]
   end
   protected
-  # Implementation of the sentence detection method. This method
-  # changes the `@sentences` ivar.
+  # Implementation of the entity detection method.
   #
-  # @return [nil] nothing.
+  # @param sample [Greeb::Entity] a sample of entity to be cloned in the
+  # process.
+  # @param stop_marks [Array<Symbol>] token types that terminate the
+  # entity currently being collected.
   #
-  def detect_sentences!
-    @sentences = []
-    rest = tokens.inject(new_sentence) do |sentence, token|
-      if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
-        next sentence
-      end
-      sentence.from = token.from unless sentence.from
-      next sentence if sentence.to and sentence.to > token.to
+  # @return [Array<Greeb::Entity>] a set of entities.
+  #
+  def detect_entities(sample, stop_marks)
+    collection = []
-      if :punct == token.type
-        sentence.to = tokens.
-          select { |t| t.from >= token.from }.
-          inject(token) { |r, t| break r if t.type != token.type; t }.to
+    rest = tokens.inject(sample.dup) do |entity, token|
+      next entity if sentence_does_not_start? entity, token
+      entity.from = token.from unless entity.from
+      next entity if entity.to and entity.to > token.to
-        @sentences << sentence
-        sentence = new_sentence
+      if stop_marks.include? token.type
+        entity.to = find_forward(tokens, token).to
+        collection << entity
+        entity = sample.dup
       elsif :separ != token.type
-        sentence.to = token.to
+        entity.to = token.to
       end
-      sentence
+      entity
     end
-    nil.tap { @sentences << rest if rest.from && rest.to }
+    if rest.from && rest.to
+      collection << rest
+    else
+      collection
+    end
   end
-  # Implementation of the subsentence detection method. This method
-  # changes the `@subsentences` ivar.
+  private
+  # Check the possibility of starting a new sentence by the specified
+  # pair of entity and token.
   #
-  # @return [nil] nothing.
+  # @param entity [Greeb::Entity] an entity to be checked.
+  # @param token [Greeb::Entity] a token to be checked.
   #
-  def detect_subsentences!
-    @subsentences = SortedSet.new
-    rest = tokens.inject(new_subsentence) do |subsentence, token|
-      if !subsentence.from && SENTENCE_DOESNT_START.include?(token.type)
-        next subsentence
-      end
-      subsentence.from = token.from unless subsentence.from
-      next subsentence if subsentence.to && subsentence.to > token.to
-      if [:punct, :spunct].include? token.type
-        subsentence.to = tokens.
-          select { |t| t.from >= token.from }.
-          inject(token) { |r, t| break r if t.type != token.type; t }.to
-        @subsentences << subsentence
-        subsentence = new_subsentence
-      elsif :separ != token.type
-        subsentence.to = token.to
-      end
-      subsentence
-    end
+  # @return true or false.
+  #
+  def sentence_does_not_start?(entity, token)
+    !entity.from and SENTENCE_DOES_NOT_START.include? token.type
+  end
-    nil.tap { @subsentences << rest if rest.from && rest.to }
+  # Find the last token in the run of consecutive tokens, starting at
+  # the sample, that share the sample's type.
+  #
+  # @param collection [Array<Greeb::Entity>] array of possible tokens.
+  # @param sample [Greeb::Entity] a token that is treated as a sample.
+  #
+  # @return [Greeb::Entity] a forwarding token.
+  #
+  def find_forward(collection, sample)
+    collection.select { |t| t.from >= sample.from }.
+      inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
   end
-  private
   # Create a new instance of {Greeb::Entity} with `:sentence` type.
   #
   # @return [Greeb::Entity] a new entity instance.

data/lib/greeb/tokenizer.rb CHANGED Viewed

@@ -49,14 +49,7 @@ module Greeb::Tokenizer
     scanner = Greeb::StringScanner.new(text)
     tokens = []
     while !scanner.eos?
-      parse! scanner, tokens, LETTERS, :letter or
-      parse! scanner, tokens, FLOATS, :float or
-      parse! scanner, tokens, INTEGERS, :integer or
-      split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
-      split_parse! scanner, tokens, PUNCTUATIONS, :punct or
-      split_parse! scanner, tokens, SEPARATORS, :separ or
-      split_parse! scanner, tokens, BREAKS, :break or
-      parse! scanner, tokens, RESIDUALS, :residual or
+      step scanner, tokens or
       raise Greeb::UnknownEntity.new(text, scanner.char_pos)
     end
     tokens
@@ -64,7 +57,25 @@ module Greeb::Tokenizer
     scanner.terminate
   end
-  private
+  protected
+  # One iteration of the tokenization process.
+  #
+  # @param scanner [Greeb::StringScanner] string scanner.
+  # @param tokens [Array<Greeb::Entity>] result array.
+  #
+  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  #
+  def step scanner, tokens
+    parse! scanner, tokens, LETTERS, :letter or
+    parse! scanner, tokens, FLOATS, :float or
+    parse! scanner, tokens, INTEGERS, :integer or
+    split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
+    split_parse! scanner, tokens, PUNCTUATIONS, :punct or
+    split_parse! scanner, tokens, SEPARATORS, :separ or
+    split_parse! scanner, tokens, BREAKS, :break or
+    parse! scanner, tokens, RESIDUALS, :residual
+  end
   # Try to parse one small piece of text that is covered by pattern
   # of necessary type.
   #
@@ -99,9 +110,23 @@ module Greeb::Tokenizer
   def split_parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos - token.length
-    token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
+    split(token).inject(position) do |before, s|
       tokens << Greeb::Entity.new(before, before + s.length, type)
       before + s.length
     end
   end
+  # Split one line into characters array, but also combine line breaks
+  # into single elements.
+  #
+  # For instance, `"a b\n\n\nc"` would be transformed into the following
+  # array: `["a", " ", "b", "\n\n\n", "c"]`.
+  #
+  # @param token [String] a token to be split.
+  #
+  # @return [Array<String>] split characters.
+  #
+  def split(token)
+    token.scan(/((.|\n)\2*)/).map(&:first)
+  end
 end

data/lib/greeb/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.0.pre3'
+  VERSION = '0.2.0.rc1'
 end

data/spec/segmentator_spec.rb CHANGED Viewed

@@ -72,8 +72,9 @@ module Greeb
     describe 'sentence extractor' do
       let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
       let(:segmentator) { Segmentator.new(tokens) }
+      let(:sentences) { segmentator.sentences }
-      subject { segmentator.extract(segmentator.sentences) }
+      subject { segmentator.extract(sentences) }
       it 'should be extracted' do
         subject.must_equal(
@@ -98,8 +99,10 @@ module Greeb
     describe 'subsentence extractor' do
       let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
       let(:segmentator) { Segmentator.new(tokens) }
+      let(:sentences) { segmentator.sentences }
+      let(:subsentences) { segmentator.subsentences }
-      subject { segmentator.subextract(segmentator.sentences) }
+      subject { segmentator.extract(sentences, subsentences) }
       it 'should extract subsentences' do
         subject.must_equal(

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.2.0.pre3
+  version: 0.2.0.rc1
 platform: ruby
 authors:
 - Dmitry Ustalov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-30 00:00:00.000000000 Z
+date: 2013-05-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: 1.3.1
 requirements: []
 rubyforge_project: greeb
-rubygems_version: 2.0.0
+rubygems_version: 2.0.3
 signing_key:
 specification_version: 4
 summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
@@ -99,4 +99,3 @@ test_files:
 - spec/spec_helper.rb
 - spec/support/invoker.rb
 - spec/tokenizer_spec.rb
-has_rdoc: