RubyGems - greeb - Versions diffs - 0.2.0.rc1 → 0.2.0.rc2 - Mend

greeb 0.2.0.rc1 → 0.2.0.rc2

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
-  data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
+  metadata.gz: 78e6a2b607a8690b2fe171665e35272efd31b2ac
+  data.tar.gz: 4517bb06cc8e1f8b0be5fc47bca4bfeda0fcfd49
 SHA512:
-  metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
-  data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
+  metadata.gz: dd24765af042566249e0c8d3153aee640c626b8f838fdda03c7b6598ad30bd0935df47ad1d1c6658ccf0e4fd598a94052c99dee127becb2f9c59b3e8dafe1cf0
+  data.tar.gz: 9eca1a25e8837732827a282d3d532db35113620372a0bde41b64669cb3fd696cc4c149267e19a5cfe58df7367b2932942554ea88cfd26826c73db926ad6ca89d

data/README.md CHANGED Viewed

@@ -129,6 +129,7 @@ pp segmentator.extract(segmentator.sentences)
 Texts often include special entities such as URLs and e-mail
 addresses. Greeb can help you retrieve these strings.
+#### URL and E-mail retrieval
 ```ruby
 text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
@@ -145,6 +146,19 @@ pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
 Please don't use Greeb for spam list development purposes.
+#### Abbreviation retrieval
+```ruby
+text = 'Hello, G.L.H.F. everyone!'
+pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
+=begin
+[[#<struct Greeb::Entity from=7, to=15, type=:abbrev>, "G.L.H.F."]]
+=end
+```
+The algorithm is not very accurate, but it is still useful in many
+practical situations.
 ## Tokens
 Greeb operates with entities, tuples of *(from, to, kind)*, where
 *from* is a beginning of the entity, *to* is an ending of the entity,

data/greeb.gemspec CHANGED Viewed

@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
   s.rubyforge_project = 'greeb'
   s.add_development_dependency 'rake'
-  s.add_development_dependency 'minitest', '>= 2.11'
+  s.add_development_dependency 'minitest', '~> 5.0'
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")

data/lib/greeb/parser.rb CHANGED Viewed

@@ -13,6 +13,9 @@ module Greeb::Parser
   # A horrible e-mail pattern.
   EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
+  # Another horrible pattern. Now for abbreviations.
+  ABBREV = /\b(\p{L}\.)+/i
  # Recognize URLs in the input text. Actually, URL is an obsolete standard
  # and this code should be rewritten to use the URI concept.
   #
@@ -34,6 +37,16 @@ module Greeb::Parser
     scan(text, EMAIL, :email)
   end
+  # Recognize abbreviations in the input text.
+  #
+  # @param text [String] input text.
+  #
+  # @return [Array<Greeb::Entity>] found abbreviations.
+  #
+  def abbrevs(text)
+    scan(text, ABBREV, :abbrev)
+  end
   private
   # Implementation of regexp-based {Greeb::Entity} scanner.
   #

data/lib/greeb/tokenizer.rb CHANGED Viewed

@@ -57,6 +57,20 @@ module Greeb::Tokenizer
     scanner.terminate
   end
+  # Split a token into an array of characters, combining runs of
+  # repeated characters into single elements.
+  #
+  # For instance, `"a b\n\n\nc"` would be transformed into the following
+  # array: `["a", " ", "b", "\n\n\n", "c"]`.
+  #
+  # @param token [String] a token to be split.
+  #
+  # @return [Array<String>] the split characters.
+  #
+  def split(token)
+    token.scan(/((.|\n)\2*)/).map(&:first)
+  end
   protected
   # One iteration of the tokenization process.
   #
@@ -115,18 +129,4 @@ module Greeb::Tokenizer
       before + s.length
     end
   end
-  # Split one line into characters array, but also combine line breaks
-  # into single elements.
-  #
-  # For instance, `"a b\n\n\nc"` would be transformed into the following
-  # array: `["a", " ", "b", "\n\n\n", "c"]`.
-  #
-  # @param token [String] a token to be splitted.
-  #
-  # @return [Array<String>] splitted characters.
-  #
-  def split(token)
-    token.scan(/((.|\n)\2*)/).map(&:first)
-  end
 end

data/lib/greeb/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.0.rc1'
+  VERSION = '0.2.0.rc2'
 end

data/spec/parser_spec.rb CHANGED Viewed

@@ -5,9 +5,9 @@ require_relative 'spec_helper'
 module Greeb
   describe Parser do
     let(:text) do
-      'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
+      'Hello there! My name is Vasya B. My website is: http://вася.рф/. ' \
       'And my e-mail is example@example.com! Also it is available by ' \
-      'URL: http://vasya.ru.'
+      'URL: http://vasya.ru. Also, G.L.H.F. everyone!'
     end
     describe 'URL' do
@@ -15,8 +15,8 @@ module Greeb
       it 'recognizes URLs' do
         subject.must_equal(
-          [Entity.new(46, 61, :url),
-           Entity.new(130, 145, :url)]
+          [Entity.new(48, 63, :url),
+           Entity.new(132, 147, :url)]
         )
       end
     end
@@ -26,7 +26,18 @@ module Greeb
       it 'recognizes e-mails' do
         subject.must_equal(
-          [Entity.new(80, 99, :email)]
+          [Entity.new(82, 101, :email)]
+        )
+      end
+    end
+    describe 'ABBREV' do
+      subject { Parser.abbrevs(text) }
+      it 'recognizes abbreviations' do
+        subject.must_equal(
+          [Entity.new(30, 32, :abbrev),
+           Entity.new(155, 163, :abbrev)]
         )
       end
     end

data/spec/support/invoker.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require 'open3'
 # http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
 #
-class MiniTest::Unit::TestCase
+class MiniTest::Test
   # Quas Wex Exort.
   #
   def invoke_cache

data/spec/tokenizer_spec.rb CHANGED Viewed

@@ -79,5 +79,20 @@ module Greeb
         )
       end
     end
+    describe '.split' do
+      it 'should split characters' do
+        Tokenizer.split('loh').must_equal %w(l o h)
+      end
+      it 'should combine duplicated characters' do
+        Tokenizer.split('foo').must_equal %w(f oo)
+      end
+      it 'should also deal with line breaks' do
+        Tokenizer.split("bar\n\nbaz").must_equal(
+          [*%w(b a r), "\n\n", *%w(b a z)])
+      end
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.2.0.rc1
+  version: 0.2.0.rc2
 platform: ruby
 authors:
 - Dmitry Ustalov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-05-05 00:00:00.000000000 Z
+date: 2013-05-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -28,16 +28,16 @@ dependencies:
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '2.11'
+        version: '5.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '2.11'
+        version: '5.0'
 description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
   written in Ruby.
 email: