RubyGems - greeb - Versions diffs - 0.2.2.1 → 0.2.3 - Mend

greeb 0.2.2.1 → 0.2.3

Files changed (16) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2e350306a429635314d151614f03ef4a0edd6887
-  data.tar.gz: 99aa7bb04b04589adbb1f62aacf2733071d14b53
+  metadata.gz: 07673b32254cd2b0ab0edf0664fa59e46231dbe3
+  data.tar.gz: f8eaac92c0fd4d7dda99c4441117b5e2b34c5caa
 SHA512:
-  metadata.gz: ae38f8a918d857283183bde7f003f44b88be59fa67213be58fb9267e2fb51853fba7422918fb1c133c526c401e0baabae7c83ed753ffc38f887a56055884d74e
-  data.tar.gz: a4d66506d58d10211aa776606f022fda66c038c7c209d3df92e11bf229bf50b990e6c05351287408fe6acce69386f6e18013730563233fbecbc01ebe577471d3
+  metadata.gz: ebfda44f713c3dcda9df0439f073a1c075360c14cc41e99c400db8eec25f06033ba2d788b5c4b7b8715eeb5cc085e6c704f6e9bad08bfd6af7b4bc4d051a8c32
+  data.tar.gz: cb60f13ddad1e17a7cdbbd19add866677f92590cb7072dd5dc3cf70b80c93a31e0e857366908ff214bb0cc5b2c585415abd78338bd8479ea3150ebf58f5d2117

data/LICENSE CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2010-2013 Dmitry Ustalov
+Copyright (c) 2010-2014 Dmitry Ustalov
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED

@@ -1,7 +1,7 @@
 # Greeb
 Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
-that is based on regular expressions. API documentation is available at
-<http://rubydoc.info/github/dmchk/greeb/master/frames>.
+that is based on regular expressions. The API documentation is available
+at <http://rubydoc.info/github/dmchk/greeb/master/frames>.
 ## Installation
 Add this line to your application's Gemfile:
@@ -134,12 +134,12 @@ addresses. Greeb can help you in these strings retrieval.
 ```ruby
 text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
-pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.urls(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
 =end
-pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.emails(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
 =end
@@ -151,7 +151,7 @@ Please don't use Greeb in spam lists development purposes.
 ```ruby
 text = 'Hello, G.L.H.F. everyone!'
-pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.abbrevs(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
 =end
@@ -164,7 +164,7 @@ situations.
 ```ruby
 text = 'Our time is running out: 13:37 or 14:89.'
-pp Greeb::Parser.time(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.time(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=25, to=30, type=:time>, "13:37"]]
 =end
@@ -194,6 +194,6 @@ There are several span types at the tokenization stage: `:letter`,
 ## Copyright
-Copyright (c) 2010-2013 [Dmitry Ustalov]. See LICENSE for details.
+Copyright (c) 2010-2014 [Dmitry Ustalov]. See LICENSE for details.
-[Dmitry Ustalov]: http://eveel.ru
+[Dmitry Ustalov]: http://ustalov.name/

data/bin/greeb CHANGED

@@ -1,13 +1,45 @@
 #!/usr/bin/env ruby
+require 'ostruct'
+require 'optparse'
 if File.exists? File.expand_path('../../.git', __FILE__)
   $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
 end
 require 'greeb'
-text = STDIN.read.tap(&:chomp!)
+options = OpenStruct.new(input: STDIN, output: STDOUT)
+optparse = OptionParser.new do |opts|
+  opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME
+  opts.on '-i', '--input [file]', 'Input file' do |input|
+    options.input = File.open(input)
+    at_exit { options.input.close }
+  end
+  opts.on '-o', '--output [file]', 'Output file' do |output|
+    options.output = File.open(output)
+    at_exit { options.output.close }
+  end
+  opts.on_tail '-h', '--help', 'Just display this help' do
+    puts opts
+    exit
+  end
+  opts.on_tail '-v', '--version', 'Just print the version infomation' do
+    puts 'Greeb %s' % Greeb::VERSION
+    exit
+  end
+end
+optparse.parse!
+text = options.input.read.tap(&:chomp!)
 Greeb[text].each do |span|
-  puts text[span.from...span.to] unless [:space, :break].include? span.type
+  next if [:space, :break].include? span.type
+  options.output.puts text[span.from...span.to]
 end

data/lib/greeb.rb CHANGED

@@ -1,5 +1,3 @@
-# encoding: utf-8
 require 'greeb/version'
 require 'greeb/exceptions'
 require 'greeb/span'

data/lib/greeb/core.rb CHANGED

@@ -11,7 +11,8 @@ module Greeb::Core
   # Recognize e-mail addresses in the input text.
   #
-  # @param text [String] input text.
+  # @param text [String] an input text.
+  # @param helpers [Array<Symbol>] a set of helper identifiers.
   #
   # @return [Array<Greeb::Span>] a set of tokens.
   #
@@ -27,7 +28,6 @@ module Greeb::Core
   alias_method :'[]', :analyze
-  protected
   # Extract spans of the specified type from the input spans set.
   #
   # @param spans [Array<Greeb::Span>] input spans set.

data/lib/greeb/parser.rb CHANGED

@@ -8,20 +8,33 @@ module Greeb::Parser
   extend self
   # An URL pattern. Not so precise, but IDN-compatible.
+  #
   URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
   # A horrible e-mail pattern.
+  #
   EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
   # Another horrible pattern. Now for abbreviations.
+  #
   ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
   # This pattern matches anything that looks like HTML. Or not.
+  #
   HTML = /<(.*?)>/i
   # Time pattern.
+  #
   TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i
+  # Apostrophe pattern.
+  #
+  APOSTROPHE = /['’]/i
+  # Span types that may be merged together.
+  #
+  TOGETHER = [:letter, :integer, :apostrophe, :together]
   # Recognize URLs in the input text. Actually, URL is obsolete standard
   # and this code should be rewritten to use the URI concept.
   #
@@ -73,6 +86,54 @@ module Greeb::Parser
     scan(text, TIME, :time)
   end
+  # Retrieve apostrophes from the tokenized text. The algorithm could be
+  # optimized further.
+  #
+  # @param text [String] input text.
+  # @param spans [Array<Greeb::Span>] already tokenized text.
+  #
+  # @return [Array<Greeb::Span>] retrieved apostrophes.
+  #
+  def apostrophes(text, spans)
+    apostrophes = scan(text, APOSTROPHE, :apostrophe)
+    return [] if apostrophes.empty?
+    apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear
+    spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
+      next unless s1 && s1.type == :letter
+      next unless s2 && s2.type == :apostrophe
+      next unless !s3 || s3 && s3.type == :letter
+      s3, k = s2, j unless s3
+      apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
+      spans[i..k] = apostrophes.last
+    end
+    apostrophes
+  end
+  # Merge some spans that are together.
+  #
+  # @param spans [Array<Greeb::Span>] already tokenized text.
+  #
+  # @return [Array<Greeb::Span>] merged spans.
+  #
+  def together(spans)
+    loop do
+      converged = true
+      spans.each_with_index.each_cons(2).reverse_each do |(s1, i), (s2, j)|
+        next unless TOGETHER.include?(s1.type) && TOGETHER.include?(s2.type)
+        spans[i..j] = Greeb::Span.new(s1.from, s2.to, :together)
+        converged = false
+      end
+      break if converged
+    end
+    spans
+  end
   private
   # Implementation of regexp-based {Greeb::Span} scanner.
   #

data/lib/greeb/segmentator.rb CHANGED

@@ -55,12 +55,12 @@ class Greeb::Segmentator
   # process.
   # @param stop_marks [Array<Symbol>] an array that stores the
   # correspondent stop marks of the necessary spans.
+  # @param collection [Array<Greeb::Span>] an initial set of spans
+  # to be populated.
   #
-  # @return [Array<Greeb::Span>] a set of entites.
+  # @return [Array<Greeb::Span>] a modified collection.
   #
-  def detect_spans(sample, stop_marks)
-    collection = []
+  def detect_spans(sample, stop_marks, collection = [])
     rest = tokens.inject(sample.dup) do |span, token|
       next span if sentence_aint_start? span, token
       span.from = token.from unless span.from
@@ -77,11 +77,7 @@ class Greeb::Segmentator
       span
     end
-    if rest.from && rest.to
-      collection << rest
-    else
-      collection
-    end
+    rest.from && rest.to ? collection << rest : collection
   end
   private

data/lib/greeb/span.rb CHANGED

@@ -19,6 +19,16 @@ class Greeb::Span < Struct.new(:from, :to, :type)
     Struct.new(*self.members, *members)
   end
+  # Select the slice of the given text using coordinates of this span.
+  #
+  # @param text [String] a text to be extracted.
+  #
+  # @return [String] the retrieved substring.
+  #
+  def slice(text)
+    text[from...to]
+  end
   # @private
   def <=> other
     if (comparison = self.from <=> other.from) == 0

data/lib/greeb/tokenizer.rb CHANGED

@@ -47,6 +47,8 @@ module Greeb::Tokenizer
   # Perform the tokenization process.
   #
+  # @param text [String] a text to be tokenized.
+  #
   # @return [Array<Greeb::Span>] a set of tokens.
   #
   def tokenize text

data/lib/greeb/version.rb CHANGED

@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.2.1'
+  VERSION = '0.2.3'
 end

data/spec/bin_spec.rb CHANGED

@@ -21,4 +21,12 @@ describe 'CLI' do
     invoke(stdin: 'Hello example@example.com guys!').must_equal(
       %w(Hello example@example.com guys !))
   end
+  it 'should print version' do
+    invoke('-v').join.must_match(/\AGreeb (\d\.)+\d\z/)
+  end
+  it 'should print help' do
+    invoke('-h').join.must_match(/Usage/)
+  end
 end

data/spec/parser_spec.rb CHANGED

@@ -8,9 +8,11 @@ describe Parser do
      'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
      'example@example.com! It is available by URL: http://vasya.ru. '  \
      'Also, <b>G.L.H.F.</b> everyone! It\'s 13:37 or 00:02:28 right '  \
-     'now, not 14:89.').freeze
+     'now, not 14:89. What about some Nagibator228?').freeze
   end
+  let(:spans) { Tokenizer.tokenize(text) }
   describe 'URL' do
     subject { Parser.urls(text) }
@@ -67,4 +69,24 @@ describe Parser do
       )
     end
   end
+  describe 'APOSTROPHE' do
+    subject { Parser.apostrophes(text, spans.dup) }
+    it 'recognizes apostrophes' do
+      subject.must_equal(
+        [Span.new(220, 224, :letter)]
+      )
+    end
+  end
+  describe 'TOGETHER' do
+    subject { Parser.together(spans.dup) }
+    it 'merges connected spans' do
+      subject.select { |s| s.type == :together }.must_equal(
+        [Span.new(281, 293, :together)]
+      )
+    end
+  end
 end

data/spec/span_spec.rb CHANGED

@@ -60,4 +60,14 @@ describe Span do
       Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
     end
   end
+  describe 'slicing' do
+    let(:text) { 'test228' }
+    subject { Span.new(4, 7) }
+    it 'should extract slices using #slice' do
+      subject.slice(text).must_equal '228'
+    end
+  end
 end

data/spec/support/invoker.rb CHANGED

@@ -19,11 +19,15 @@ class MiniTest::Test
     arguments = argv.dup
     options = (arguments.last.is_a? Hash) ? arguments.pop : {}
     executable = File.expand_path('../../../bin/greeb', __FILE__)
+    status = nil
-    Open3.popen3(executable, *arguments) do |i, o, *_|
+    Open3.popen3(executable, *arguments) do |i, o, _, t|
       i.puts options[:stdin] if options[:stdin]
       i.close
       invoke_cache[argv] = o.readlines.map(&:chomp!)
+      status = t.value
     end
+    invoke_cache[argv] if status.success?
   end
 end

metadata CHANGED

@@ -1,27 +1,27 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.2.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Dmitry Ustalov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-15 00:00:00.000000000 Z
+date: 2014-05-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '5.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '5.0'
 description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
@@ -33,8 +33,8 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .travis.yml
+- ".gitignore"
+- ".travis.yml"
 - Gemfile
 - LICENSE
 - README.md
@@ -68,17 +68,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project: greeb
-rubygems_version: 2.1.9
+rubygems_version: 2.2.2
 signing_key:
 specification_version: 4
 summary: Greeb is a simple Unicode-aware regexp-based tokenizer.