RubyGems - greeb - Versions diffs - 0.2.2.rc1 → 0.2.2.rc2 - Mend

greeb 0.2.2.rc1 → 0.2.2.rc2

Files changed (18) hide show

@@ -0,0 +1,36 @@
+# Greeb operates with spans. A span is a tuple of *(from, to, type)*, where
+# *from* is the beginning of the span, *to* is the end of the span,
+# and *type* is the type of the span.
+#
+# There are several span types: `:letter` for letters, `:float` for
+# floating point decimals, `:integer` for numbers, `:separ` for separators,
+# `:punct` for punctuation characters, `:spunct` for in-sentence punctuation
+# characters, `:space` for spaces, and `:break` for line endings.
+#
+class Greeb::Span < Struct.new(:from, :to, :type)
+  # Create a derivative structure whose members are the Greeb::Span
+  # members followed by the given ones. Useful for integrating with Greeb.
+  #
+  # @param members [Array<Symbol>] additional members, appended after Greeb::Span's own.
+  #
+  # @return [Class] a new class built by Struct.new (not a subclass of Greeb::Span).
+  #
+  def self.derivate(*members)
+    Struct.new(*self.members, *members)
+  end
+  # @private Orders spans by `from`, then by `to` when `from` is equal; `type` is not compared.
+  def <=> other
+    if (comparison = self.from <=> other.from) == 0
+      self.to <=> other.to
+    else
+      comparison
+    end
+  end
+  # @private Spans are eql? only when `type` matches and `from`/`to` compare as equal via <=>.
+  def eql? other
+    return false unless type == other.type
+    (self <=> other) == 0
+  end
+end

data/lib/greeb/tokenizer.rb CHANGED

@@ -35,7 +35,7 @@ module Greeb::Tokenizer
  # Spaces (e.g. " " or &nbsp;).
   #
-  SPACES = /[\p{Zs}]+/u
+  SPACES = /[\p{Zs}\t]+/u
   # Line breaks.
   #
@@ -47,14 +47,14 @@ module Greeb::Tokenizer
   # Perform the tokenization process.
   #
-  # @return [Array<Greeb::Entity>] a set of tokens.
+  # @return [Array<Greeb::Span>] a set of tokens.
   #
   def tokenize text
     scanner = Greeb::StringScanner.new(text)
     tokens = []
     while !scanner.eos?
       step scanner, tokens or
-      raise Greeb::UnknownEntity.new(text, scanner.char_pos)
+      raise Greeb::UnknownSpan.new(text, scanner.char_pos)
     end
     tokens
   ensure
@@ -79,9 +79,9 @@ module Greeb::Tokenizer
   # One iteration of the tokenization process.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def step scanner, tokens
     parse! scanner, tokens, LETTERS, :letter or
@@ -99,17 +99,17 @@ module Greeb::Tokenizer
   # of necessary type.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>, false] the modified set of extracted tokens, or false if the pattern did not match.
   #
   def parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos
-    tokens << Greeb::Entity.new(position - token.length,
+    tokens << Greeb::Span.new(position - token.length,
                                 position,
                                 type)
   end
@@ -119,18 +119,18 @@ module Greeb::Tokenizer
   # characters.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Integer, false] the character offset just past the last extracted token, or false if the pattern did not match.
   #
   def split_parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos - token.length
     split(token).inject(position) do |before, s|
-      tokens << Greeb::Entity.new(before, before + s.length, type)
+      tokens << Greeb::Span.new(before, before + s.length, type)
       before + s.length
     end
   end

data/lib/greeb/version.rb CHANGED

@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.2.rc1'
+  VERSION = '0.2.2.rc2'
 end

data/spec/core_spec.rb CHANGED

@@ -2,41 +2,39 @@
 require_relative 'spec_helper'
-module Greeb
-  describe Greeb do
-    it 'should do nothing when ran without input' do
-      Greeb[''].must_be_empty
-    end
+describe Greeb do
+  it 'should do nothing when ran without input' do
+    Greeb[''].must_be_empty
+  end
-    it 'should tokenize text when input is given' do
-      Greeb['Hello guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 10, :letter),
-         Entity.new(10, 11, :punct)]
-      )
-    end
+  it 'should tokenize text when input is given' do
+    Greeb['Hello guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 10, :letter),
+       Span.new(10, 11, :punct)]
+    )
+  end
-    it 'should extract URLs' do
-      Greeb['Hello http://nlpub.ru guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 21, :url),
-         Entity.new(21, 22, :space),
-         Entity.new(22, 26, :letter),
-         Entity.new(26, 27, :punct)]
-      )
-    end
+  it 'should extract URLs' do
+    Greeb['Hello http://nlpub.ru guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 21, :url),
+       Span.new(21, 22, :space),
+       Span.new(22, 26, :letter),
+       Span.new(26, 27, :punct)]
+    )
+  end
-    it 'should extract e-mails' do
-      Greeb['Hello example@example.com guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 25, :email),
-         Entity.new(25, 26, :space),
-         Entity.new(26, 30, :letter),
-         Entity.new(30, 31, :punct)]
-      )
-    end
+  it 'should extract e-mails' do
+    Greeb['Hello example@example.com guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 25, :email),
+       Span.new(25, 26, :space),
+       Span.new(26, 30, :letter),
+       Span.new(30, 31, :punct)]
+    )
   end
 end

data/spec/parser_spec.rb CHANGED

@@ -2,45 +2,57 @@
 require_relative 'spec_helper'
-module Greeb
-  describe Parser do
-    let(:text) do
-      'Hello there! My name is Vasya B. and I am к.ф.-м.н. My website is ' \
-      'http://вася.рф/. And my e-mail is example@example.com! Also it is ' \
-      'available by URL: http://vasya.ru. Also, G.L.H.F. everyone!'
+describe Parser do
+  let(:text) do
+    ('Hello there! My name is <span class="name">Vasya B.</span> and ' \
+     'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
+     'example@example.com! It is available by URL: http://vasya.ru. '  \
+     'Also, <b>G.L.H.F.</b> everyone!').freeze
+  end
+  describe 'URL' do
+    subject { Parser.urls(text) }
+    it 'recognizes URLs' do
+      subject.must_equal(
+        [Span.new(92, 107, :url),
+         Span.new(171, 186, :url)]
+      )
     end
+  end
-    describe 'URL' do
-      subject { Parser.urls(text) }
+  describe 'EMAIL' do
+    subject { Parser.emails(text) }
-      it 'recognizes URLs' do
-        subject.must_equal(
-          [Entity.new(66, 81, :url),
-           Entity.new(150, 165, :url)]
-        )
-      end
+    it 'recognizes e-mails' do
+      subject.must_equal(
+        [Span.new(126, 145, :email)]
+      )
     end
+  end
-    describe 'EMAIL' do
-      subject { Parser.emails(text) }
+  describe 'ABBREV' do
+    subject { Parser.abbrevs(text) }
-      it 'recognizes e-mails' do
-        subject.must_equal(
-          [Entity.new(100, 119, :email)]
-        )
-      end
+    it 'recognizes abbreviations' do
+      subject.must_equal(
+        [Span.new(49, 51, :abbrev),
+         Span.new(68, 77, :abbrev),
+         Span.new(197, 205, :abbrev)]
+      )
     end
+  end
-    describe 'ABBREV' do
-      subject { Parser.abbrevs(text) }
+  describe 'HTML' do
+    subject { Parser.html(text) }
-      it 'recognizes abbreviations' do
-        subject.must_equal(
-          [Entity.new(30, 32, :abbrev),
-           Entity.new(42, 51, :abbrev),
-           Entity.new(173, 181, :abbrev)]
-        )
-      end
+    it 'recognizes HTML entities' do
+      subject.must_equal(
+        [Span.new(24, 43, :html),
+         Span.new(51, 58, :html),
+         Span.new(194, 197, :html),
+         Span.new(205, 209, :html)]
+      )
     end
   end
 end

data/spec/segmentator_spec.rb CHANGED

@@ -2,116 +2,114 @@
 require_relative 'spec_helper'
-module Greeb
-  describe Segmentator do
-    describe 'initialization' do
-      let(:tokens) { Tokenizer.tokenize('Vodka') }
+describe Segmentator do
+  describe 'initialization' do
+    let(:tokens) { Tokenizer.tokenize('Vodka') }
-      subject { Segmentator.new(tokens) }
+    subject { Segmentator.new(tokens) }
-      it 'is initialized either with set of tokens' do
-        subject.tokens.must_be_kind_of Array
-      end
+    it 'is initialized either with set of tokens' do
+      subject.tokens.must_be_kind_of Array
+    end
-      it 'should has @tokens ivar' do
-        subject.instance_variable_get(:@tokens).wont_be_nil
-      end
+    it 'should has @tokens ivar' do
+      subject.instance_variable_get(:@tokens).wont_be_nil
     end
+  end
-    describe 'a simple sentence' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+  describe 'a simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 22, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 22, :sentence)])
     end
+  end
-    describe 'a simple sentence without punctuation' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
+  describe 'a simple sentence without punctuation' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 21, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 21, :sentence)])
     end
+  end
-    describe 'a simple sentence with trailing whitespaces' do
-      let(:tokens) { Tokenizer.tokenize('      Hello, I am JC Denton  ') }
+  describe 'a simple sentence with trailing whitespaces' do
+    let(:tokens) { Tokenizer.tokenize('      Hello, I am JC Denton  ') }
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(6, 27, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(6, 27, :sentence)])
     end
+  end
-    describe 'two simple sentences' do
-      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+  describe 'two simple sentences' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 6,  :sentence),
-                            Entity.new(7, 22, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 6,  :sentence),
+                          Span.new(7, 22, :sentence)])
     end
+  end
-    describe 'one wrong character and one simple sentence' do
-      let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
+  describe 'one wrong character and one simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(2, 17, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(2, 17, :sentence)])
     end
+  end
-    describe 'sentence extractor' do
-      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
-      let(:segmentator) { Segmentator.new(tokens) }
-      let(:sentences) { segmentator.sentences }
-      subject { segmentator.extract(sentences) }
-      it 'should be extracted' do
-        subject.must_equal(
-          Entity.new(0,  6, :sentence) => [
-            Entity.new(0, 5, :letter),
-            Entity.new(5, 6, :punct)
-          ],
-          Entity.new(7, 22, :sentence) => [
-            Entity.new(7,  8,  :letter),
-            Entity.new(8,  9,  :space),
-            Entity.new(9,  11, :letter),
-            Entity.new(11, 12, :space),
-            Entity.new(12, 14, :letter),
-            Entity.new(14, 15, :space),
-            Entity.new(15, 21, :letter),
-            Entity.new(21, 22, :punct)
-          ]
-        )
-      end
+  describe 'sentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+    subject { segmentator.extract(sentences) }
+    it 'should be extracted' do
+      subject.must_equal([
+        [Span.new(0,  6, :sentence), [
+          Span.new(0, 5, :letter),
+          Span.new(5, 6, :punct)
+        ]],
+        [Span.new(7, 22, :sentence), [
+          Span.new(7,  8,  :letter),
+          Span.new(8,  9,  :space),
+          Span.new(9,  11, :letter),
+          Span.new(11, 12, :space),
+          Span.new(12, 14, :letter),
+          Span.new(14, 15, :space),
+          Span.new(15, 21, :letter),
+          Span.new(21, 22, :punct)
+        ]]
+      ])
     end
+  end
-    describe 'subsentence extractor' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
-      let(:segmentator) { Segmentator.new(tokens) }
-      let(:sentences) { segmentator.sentences }
-      let(:subsentences) { segmentator.subsentences }
-      subject { segmentator.extract(sentences, subsentences) }
-      it 'should extract subsentences' do
-        subject.must_equal(
-          Entity.new(0,  22, :sentence) => [
-            Entity.new(0, 6, :subsentence),
-            Entity.new(7, 22, :subsentence)
-          ]
-        )
-      end
+  describe 'subsentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+    let(:subsentences) { segmentator.subsentences }
+    subject { segmentator.extract(sentences, subsentences) }
+    it 'should extract subsentences' do
+      subject.must_equal([
+        [Span.new(0,  22, :sentence), [
+          Span.new(0, 6, :subsentence),
+          Span.new(7, 22, :subsentence)
+        ]]
+      ])
     end
   end
 end