RubyGems - greeb - Versions diffs - 0.2.2.1 → 0.2.3 - Mend

greeb 0.2.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2e350306a429635314d151614f03ef4a0edd6887
-  data.tar.gz: 99aa7bb04b04589adbb1f62aacf2733071d14b53
+  metadata.gz: 07673b32254cd2b0ab0edf0664fa59e46231dbe3
+  data.tar.gz: f8eaac92c0fd4d7dda99c4441117b5e2b34c5caa
 SHA512:
-  metadata.gz: ae38f8a918d857283183bde7f003f44b88be59fa67213be58fb9267e2fb51853fba7422918fb1c133c526c401e0baabae7c83ed753ffc38f887a56055884d74e
-  data.tar.gz: a4d66506d58d10211aa776606f022fda66c038c7c209d3df92e11bf229bf50b990e6c05351287408fe6acce69386f6e18013730563233fbecbc01ebe577471d3
+  metadata.gz: ebfda44f713c3dcda9df0439f073a1c075360c14cc41e99c400db8eec25f06033ba2d788b5c4b7b8715eeb5cc085e6c704f6e9bad08bfd6af7b4bc4d051a8c32
+  data.tar.gz: cb60f13ddad1e17a7cdbbd19add866677f92590cb7072dd5dc3cf70b80c93a31e0e857366908ff214bb0cc5b2c585415abd78338bd8479ea3150ebf58f5d2117

data/LICENSE CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2010-2013 Dmitry Ustalov
+Copyright (c) 2010-2014 Dmitry Ustalov
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED

@@ -1,7 +1,7 @@
 # Greeb
 Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
-that is based on regular expressions. API documentation is available at
-<http://rubydoc.info/github/dmchk/greeb/master/frames>.
+that is based on regular expressions. The API documentation is available
+at <http://rubydoc.info/github/dmchk/greeb/master/frames>.
 ## Installation
 Add this line to your application's Gemfile:
@@ -134,12 +134,12 @@ addresses. Greeb can help you in these strings retrieval.
 ```ruby
 text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
-pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.urls(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
 =end
-pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.emails(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
 =end
@@ -151,7 +151,7 @@ Please don't use Greeb in spam lists development purposes.
 ```ruby
 text = 'Hello, G.L.H.F. everyone!'
-pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.abbrevs(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
 =end
@@ -164,7 +164,7 @@ situations.
 ```ruby
 text = 'Our time is running out: 13:37 or 14:89.'
-pp Greeb::Parser.time(text).map { |e| [e, text[e.from...e.to]] }
+pp Greeb::Parser.time(text).map { |e| [e, e.slice(text)] }
 =begin
 [[#<struct Greeb::Span from=25, to=30, type=:time>, "13:37"]]
 =end
@@ -194,6 +194,6 @@ There are several span types at the tokenization stage: `:letter`,
 ## Copyright
-Copyright (c) 2010-2013 [Dmitry Ustalov]. See LICENSE for details.
+Copyright (c) 2010-2014 [Dmitry Ustalov]. See LICENSE for details.
-[Dmitry Ustalov]: http://eveel.ru
+[Dmitry Ustalov]: http://ustalov.name/

data/bin/greeb CHANGED

@@ -1,13 +1,45 @@
 #!/usr/bin/env ruby
+require 'ostruct'
+require 'optparse'
 if File.exists? File.expand_path('../../.git', __FILE__)
   $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
 end
 require 'greeb'
-text = STDIN.read.tap(&:chomp!)
+options = OpenStruct.new(input: STDIN, output: STDOUT)
+optparse = OptionParser.new do |opts|
+  opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME
+  opts.on '-i', '--input [file]', 'Input file' do |input|
+    options.input = File.open(input)
+    at_exit { options.input.close }
+  end
+  opts.on '-o', '--output [file]', 'Output file' do |output|
+    options.output = File.open(output)
+    at_exit { options.output.close }
+  end
+  opts.on_tail '-h', '--help', 'Just display this help' do
+    puts opts
+    exit
+  end
+  opts.on_tail '-v', '--version', 'Just print the version infomation' do
+    puts 'Greeb %s' % Greeb::VERSION
+    exit
+  end
+end
+optparse.parse!
+text = options.input.read.tap(&:chomp!)
 Greeb[text].each do |span|
-  puts text[span.from...span.to] unless [:space, :break].include? span.type
+  next if [:space, :break].include? span.type
+  options.output.puts text[span.from...span.to]
 end

data/lib/greeb.rb CHANGED

@@ -1,5 +1,3 @@
-# encoding: utf-8
 require 'greeb/version'
 require 'greeb/exceptions'
 require 'greeb/span'

data/lib/greeb/core.rb CHANGED

@@ -11,7 +11,8 @@ module Greeb::Core
   # Recognize e-mail addresses in the input text.
   #
-  # @param text [String] input text.
+  # @param text [String] an input text.
+  # @param helpers [Array<Symbol>] a set of helper identifiers.
   #
   # @return [Array<Greeb::Span>] a set of tokens.
   #
@@ -27,7 +28,6 @@ module Greeb::Core
   alias_method :'[]', :analyze
-  protected
  # Extract spans of the specified type from the input spans set.
   #
   # @param spans [Array<Greeb::Span>] input spans set.

data/lib/greeb/parser.rb CHANGED

@@ -8,20 +8,33 @@ module Greeb::Parser
   extend self
   # A URL pattern. Not so precise, but IDN-compatible.
+  #
   URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
   # A horrible e-mail pattern.
+  #
   EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
   # Another horrible pattern. Now for abbreviations.
+  #
   ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
   # This pattern matches anything that looks like HTML. Or not.
+  #
   HTML = /<(.*?)>/i
   # Time pattern.
+  #
   TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i
+  # Apostrophe pattern.
+  #
+  APOSTROPHE = /['’]/i
+  # Span types that may be merged into a single :together span.
+  #
+  TOGETHER = [:letter, :integer, :apostrophe, :together]
   # Recognize URLs in the input text. Actually, URL is an obsolete standard
   # and this code should be rewritten to use the URI concept.
   #
@@ -73,6 +86,54 @@ module Greeb::Parser
     scan(text, TIME, :time)
   end
+  # Retrieve apostrophes from the tokenized text. The algorithm could be
+  # made more efficient.
+  #
+  # @param text [String] input text.
+  # @param spans [Array<Greeb::Span>] already tokenized text.
+  #
+  # @return [Array<Greeb::Span>] retrieved apostrophes.
+  #
+  def apostrophes(text, spans)
+    apostrophes = scan(text, APOSTROPHE, :apostrophe)
+    return [] if apostrophes.empty?
+    apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear
+    spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
+      next unless s1 && s1.type == :letter
+      next unless s2 && s2.type == :apostrophe
+      next unless !s3 || s3 && s3.type == :letter
+      s3, k = s2, j unless s3
+      apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
+      spans[i..k] = apostrophes.last
+    end
+    apostrophes
+  end
+  # Merge some spans that are together.
+  #
+  # @param spans [Array<Greeb::Span>] already tokenized text.
+  #
+  # @return [Array<Greeb::Span>] merged spans.
+  #
+  def together(spans)
+    loop do
+      converged = true
+      spans.each_with_index.each_cons(2).reverse_each do |(s1, i), (s2, j)|
+        next unless TOGETHER.include?(s1.type) && TOGETHER.include?(s2.type)
+        spans[i..j] = Greeb::Span.new(s1.from, s2.to, :together)
+        converged = false
+      end
+      break if converged
+    end
+    spans
+  end
   private
   # Implementation of regexp-based {Greeb::Span} scanner.
   #

data/lib/greeb/segmentator.rb CHANGED

@@ -55,12 +55,12 @@ class Greeb::Segmentator
   # process.
   # @param stop_marks [Array<Symbol>] an array that stores the
   # correspondent stop marks of the necessary spans.
+  # @param collection [Array<Greeb::Span>] an initial set of spans
+  # to be populated.
   #
-  # @return [Array<Greeb::Span>] a set of entites.
+  # @return [Array<Greeb::Span>] a modified collection.
   #
-  def detect_spans(sample, stop_marks)
-    collection = []
+  def detect_spans(sample, stop_marks, collection = [])
     rest = tokens.inject(sample.dup) do |span, token|
       next span if sentence_aint_start? span, token
       span.from = token.from unless span.from
@@ -77,11 +77,7 @@ class Greeb::Segmentator
       span
     end
-    if rest.from && rest.to
-      collection << rest
-    else
-      collection
-    end
+    rest.from && rest.to ? collection << rest : collection
   end
   private

data/lib/greeb/span.rb CHANGED

@@ -19,6 +19,16 @@ class Greeb::Span < Struct.new(:from, :to, :type)
     Struct.new(*self.members, *members)
   end
+  # Select the slice of the given text using coordinates of this span.
+  #
+  # @param text [String] a text to be extracted.
+  #
+  # @return [String] the retrieved substring.
+  #
+  def slice(text)
+    text[from...to]
+  end
   # @private
   def <=> other
     if (comparison = self.from <=> other.from) == 0

data/lib/greeb/tokenizer.rb CHANGED

@@ -47,6 +47,8 @@ module Greeb::Tokenizer
   # Perform the tokenization process.
   #
+  # @param text [String] a text to be tokenized.
+  #
   # @return [Array<Greeb::Span>] a set of tokens.
   #
   def tokenize text

data/lib/greeb/version.rb CHANGED

@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.2.1'
+  VERSION = '0.2.3'
 end

data/spec/bin_spec.rb CHANGED

@@ -21,4 +21,12 @@ describe 'CLI' do
     invoke(stdin: 'Hello example@example.com guys!').must_equal(
       %w(Hello example@example.com guys !))
   end
+  it 'should print version' do
+    invoke('-v').join.must_match(/\AGreeb (\d\.)+\d\z/)
+  end
+  it 'should print help' do
+    invoke('-h').join.must_match(/Usage/)
+  end
 end

data/spec/parser_spec.rb CHANGED

@@ -8,9 +8,11 @@ describe Parser do
      'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
      'example@example.com! It is available by URL: http://vasya.ru. '  \
      'Also, <b>G.L.H.F.</b> everyone! It\'s 13:37 or 00:02:28 right '  \
-     'now, not 14:89.').freeze
+     'now, not 14:89. What about some Nagibator228?').freeze
   end
+  let(:spans) { Tokenizer.tokenize(text) }
   describe 'URL' do
     subject { Parser.urls(text) }
@@ -67,4 +69,24 @@ describe Parser do
       )
     end
   end
+  describe 'APOSTROPHE' do
+    subject { Parser.apostrophes(text, spans.dup) }
+    it 'recognizes apostrophes' do
+      subject.must_equal(
+        [Span.new(220, 224, :letter)]
+      )
+    end
+  end
+  describe 'TOGETHER' do
+    subject { Parser.together(spans.dup) }
+    it 'merges connected spans' do
+      subject.select { |s| s.type == :together }.must_equal(
+        [Span.new(281, 293, :together)]
+      )
+    end
+  end
 end

data/spec/span_spec.rb CHANGED

@@ -60,4 +60,14 @@ describe Span do
       Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
     end
   end
+  describe 'slicing' do
+    let(:text) { 'test228' }
+    subject { Span.new(4, 7) }
+    it 'should extract slices using #slice' do
+      subject.slice(text).must_equal '228'
+    end
+  end
 end

data/spec/support/invoker.rb CHANGED

@@ -19,11 +19,15 @@ class MiniTest::Test
     arguments = argv.dup
     options = (arguments.last.is_a? Hash) ? arguments.pop : {}
     executable = File.expand_path('../../../bin/greeb', __FILE__)
+    status = nil
-    Open3.popen3(executable, *arguments) do |i, o, *_|
+    Open3.popen3(executable, *arguments) do |i, o, _, t|
       i.puts options[:stdin] if options[:stdin]
       i.close
       invoke_cache[argv] = o.readlines.map(&:chomp!)
+      status = t.value
     end
+    invoke_cache[argv] if status.success?
   end
 end

metadata CHANGED

@@ -1,27 +1,27 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.2.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Dmitry Ustalov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-15 00:00:00.000000000 Z
+date: 2014-05-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '5.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '5.0'
 description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
@@ -33,8 +33,8 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .travis.yml
+- ".gitignore"
+- ".travis.yml"
 - Gemfile
 - LICENSE
 - README.md
@@ -68,17 +68,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project: greeb
-rubygems_version: 2.1.9
+rubygems_version: 2.2.2
 signing_key:
 specification_version: 4
 summary: Greeb is a simple Unicode-aware regexp-based tokenizer.