RubyGems - greeb - Versions diffs - 0.2.0.rc1 → 0.2.0.rc2 - Mend

greeb 0.2.0.rc1 → 0.2.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
-  data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
+  metadata.gz: 78e6a2b607a8690b2fe171665e35272efd31b2ac
+  data.tar.gz: 4517bb06cc8e1f8b0be5fc47bca4bfeda0fcfd49
 SHA512:
-  metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
-  data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
+  metadata.gz: dd24765af042566249e0c8d3153aee640c626b8f838fdda03c7b6598ad30bd0935df47ad1d1c6658ccf0e4fd598a94052c99dee127becb2f9c59b3e8dafe1cf0
+  data.tar.gz: 9eca1a25e8837732827a282d3d532db35113620372a0bde41b64669cb3fd696cc4c149267e19a5cfe58df7367b2932942554ea88cfd26826c73db926ad6ca89d

data/README.md CHANGED Viewed

@@ -129,6 +129,7 @@ pp segmentator.extract(segmentator.sentences)
 Texts are often include some special entities such as URLs and e-mail
 addresses. Greeb can help you in these strings retrieval.
+#### URL and E-mail retrieval
 ```ruby
 text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
@@ -145,6 +146,19 @@ pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
 Please don't use Greeb in spam lists development purposes.
+#### Abbreviation retrieval
+```ruby
+text = 'Hello, G.L.H.F. everyone!'
+pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
+=begin
+[[#<struct Greeb::Entity from=7, to=15, type=:abbrev>, "G.L.H.F."]]
+=end
+```
+The algorithm is not so accurate, but still useful in many practical
+situations.
 ## Tokens
 Greeb operates with entities, tuples of *(from, to, kind)*, where
 *from* is a beginning of the entity, *to* is an ending of the entity,

data/greeb.gemspec CHANGED Viewed

@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
   s.rubyforge_project = 'greeb'
   s.add_development_dependency 'rake'
-  s.add_development_dependency 'minitest', '>= 2.11'
+  s.add_development_dependency 'minitest', '~> 5.0'
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")

data/lib/greeb/parser.rb CHANGED Viewed

@@ -13,6 +13,9 @@ module Greeb::Parser
   # A horrible e-mail pattern.
   EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
+  # Another horrible pattern. Now for abbreviations.
+  ABBREV = /\b(\p{L}\.)+/i
   # Recognize URLs in the input text. Actually, URL is obsolete standard
   # and this code should be rewritten to use the URI concept.
   #
@@ -34,6 +37,16 @@ module Greeb::Parser
     scan(text, EMAIL, :email)
   end
+  # Recognize abbreviations in the input text.
+  #
+  # @param text [String] input text.
+  #
+  # @return [Array<Greeb::Entity>] found abbreviations.
+  #
+  def abbrevs(text)
+    scan(text, ABBREV, :abbrev)
+  end
   private
   # Implementation of regexp-based {Greeb::Entity} scanner.
   #

data/lib/greeb/tokenizer.rb CHANGED Viewed

@@ -57,6 +57,20 @@ module Greeb::Tokenizer
     scanner.terminate
   end
+  # Split one line into characters array, but also combine duplicated
+  # characters.
+  #
+  # For instance, `"a b\n\n\nc"` would be transformed into the following
+  # array: `["a", " ", "b", "\n\n\n", "c"]`.
+  #
+  # @param token [String] a token to be splitted.
+  #
+  # @return [Array<String>] splitted characters.
+  #
+  def split(token)
+    token.scan(/((.|\n)\2*)/).map(&:first)
+  end
   protected
   # One iteration of the tokenization process.
   #
@@ -115,18 +129,4 @@ module Greeb::Tokenizer
       before + s.length
     end
   end
-  # Split one line into characters array, but also combine line breaks
-  # into single elements.
-  #
-  # For instance, `"a b\n\n\nc"` would be transformed into the following
-  # array: `["a", " ", "b", "\n\n\n", "c"]`.
-  #
-  # @param token [String] a token to be splitted.
-  #
-  # @return [Array<String>] splitted characters.
-  #
-  def split(token)
-    token.scan(/((.|\n)\2*)/).map(&:first)
-  end
 end

data/lib/greeb/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.0.rc1'
+  VERSION = '0.2.0.rc2'
 end

data/spec/parser_spec.rb CHANGED Viewed

@@ -5,9 +5,9 @@ require_relative 'spec_helper'
 module Greeb
   describe Parser do
     let(:text) do
-      'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
+      'Hello there! My name is Vasya B. My website is: http://вася.рф/. ' \
       'And my e-mail is example@example.com! Also it is available by ' \
-      'URL: http://vasya.ru.'
+      'URL: http://vasya.ru. Also, G.L.H.F. everyone!'
     end
     describe 'URL' do
@@ -15,8 +15,8 @@ module Greeb
       it 'recognizes URLs' do
         subject.must_equal(
-          [Entity.new(46, 61, :url),
-           Entity.new(130, 145, :url)]
+          [Entity.new(48, 63, :url),
+           Entity.new(132, 147, :url)]
         )
       end
     end
@@ -26,7 +26,18 @@ module Greeb
       it 'recognizes e-mails' do
         subject.must_equal(
-          [Entity.new(80, 99, :email)]
+          [Entity.new(82, 101, :email)]
+        )
+      end
+    end
+    describe 'ABBREV' do
+      subject { Parser.abbrevs(text) }
+      it 'recognizes abbreviations' do
+        subject.must_equal(
+          [Entity.new(30, 32, :abbrev),
+           Entity.new(155, 163, :abbrev)]
         )
       end
     end

data/spec/support/invoker.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require 'open3'
 # http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
 #
-class MiniTest::Unit::TestCase
+class MiniTest::Test
   # Quas Wex Exort.
   #
   def invoke_cache

data/spec/tokenizer_spec.rb CHANGED Viewed

@@ -79,5 +79,20 @@ module Greeb
         )
       end
     end
+    describe '.split' do
+      it 'should split characters' do
+        Tokenizer.split('loh').must_equal %w(l o h)
+      end
+      it 'should combine duplicated characters' do
+        Tokenizer.split('foo').must_equal %w(f oo)
+      end
+      it 'should also deal with line breaks' do
+        Tokenizer.split("bar\n\nbaz").must_equal(
+          [*%w(b a r), "\n\n", *%w(b a z)])
+      end
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.2.0.rc1
+  version: 0.2.0.rc2
 platform: ruby
 authors:
 - Dmitry Ustalov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-05-05 00:00:00.000000000 Z
+date: 2013-05-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -28,16 +28,16 @@ dependencies:
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '2.11'
+        version: '5.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '2.11'
+        version: '5.0'
 description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
   written in Ruby.
 email: