greeb 0.1.0.rc1 → 0.1.0.rc3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/.travis.yml +0 -1
- data/LICENSE +1 -1
- data/README.md +6 -5
- data/lib/greeb.rb +2 -0
- data/lib/greeb/segmentator.rb +2 -2
- data/lib/greeb/strscan.rb +20 -0
- data/lib/greeb/tokenizer.rb +8 -6
- data/lib/greeb/version.rb +1 -1
- data/spec/tokenizer_spec.rb +16 -0
- metadata +4 -3
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -79,7 +79,8 @@ such as sentence detection tasks:
 
 ```ruby
 text = 'Hello! How are you?'
-
+tokenizer = Greeb::Tokenizer.new(text)
+pp Greeb::Segmentator.new(tokenizer).sentences
 =begin
 #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
 #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
@@ -91,8 +92,8 @@ segmentator:
 
 ```ruby
 text = 'Hello! How are you?'
-
-sentences =
+tokenizer = Greeb::Tokenizer.new(text)
+sentences = Greeb::Segmentator.new(tokenizer).sentences
 pp segmentator.extract(*sentences)
 =begin
 {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
@@ -135,6 +136,6 @@ systematic and awesome.
 
 ## Copyright
 
-Copyright (c) 2010-2012 [Dmitry
+Copyright (c) 2010-2012 [Dmitry Ustalov]. See LICENSE for details.
 
-[Dmitry
+[Dmitry Ustalov]: http://eveel.ru
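Worth noting about the second README hunk above: the new lines build `sentences` from a throwaway `Greeb::Segmentator` instance, while the unchanged context line still calls `segmentator.extract(*sentences)` on a variable the snippet never assigns. A minimal sketch of what the example presumably intends, keeping the segmentator in a variable (not part of the diff, just an illustration):

```ruby
require 'pp'
require 'greeb'

text = 'Hello! How are you?'
tokenizer = Greeb::Tokenizer.new(text)

# Keep the segmentator around so it can both detect and extract sentences.
segmentator = Greeb::Segmentator.new(tokenizer)
sentences = segmentator.sentences
pp segmentator.extract(*sentences)
```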
data/lib/greeb.rb
CHANGED
@@ -12,6 +12,7 @@ require 'greeb/version'
 # `:break` for line endings.
 #
 class Greeb::Entity < Struct.new(:from, :to, :type)
+  # @private
   def <=> other
     if (comparison = self.from <=> other.from) == 0
       self.to <=> other.to
@@ -21,5 +22,6 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
   end
 end
 
+require 'greeb/strscan'
 require 'greeb/tokenizer'
 require 'greeb/segmentator'
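The `# @private` tag only hides `Greeb::Entity#<=>` from the generated documentation; the comparison itself is what lets entities live in a `SortedSet` ordered by position. A minimal sketch of that ordering (the entity values are illustrative):

```ruby
require 'greeb'

a = Greeb::Entity.new(0, 6, :sentence)
b = Greeb::Entity.new(7, 19, :sentence)

# Entities compare by `from` first and then by `to`, so the set comes out
# ordered by position regardless of insertion order.
SortedSet.new([b, a]).to_a  #=> [a, b]
```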
data/lib/greeb/segmentator.rb
CHANGED
@@ -14,7 +14,7 @@ class Greeb::Segmentator
   # Create a new instance of {Greeb::Segmentator}.
   #
   # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
-  #
+  #   {Greeb::Tokenizer} or set of its results.
   #
   def initialize tokenizer_or_tokens
     @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
@@ -38,7 +38,7 @@ class Greeb::Segmentator
   # @param sentences [Array<Greeb::Entity>] a list of sentences.
   #
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-  #
+  #   sentences as keys and tokens arrays as values.
   #
   def extract *sentences
     Hash[
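Per the corrected `@param` documentation, the constructor accepts either a tokenizer or a ready-made set of its tokens. A small sketch of the two call styles (the sample text is illustrative; both instances should detect the same sentences):

```ruby
require 'greeb'

tokenizer = Greeb::Tokenizer.new('Hello! How are you?')

# Either hand over the tokenizer itself...
by_tokenizer = Greeb::Segmentator.new(tokenizer)

# ...or the set of tokens it has produced.
by_tokens = Greeb::Segmentator.new(tokenizer.tokens)

by_tokenizer.sentences  #=> the same sentence entities as by_tokens.sentences
```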
data/lib/greeb/strscan.rb
ADDED
@@ -0,0 +1,20 @@
+# encoding: utf-8
+
+require 'strscan'
+
+# {StringScanner} provides for lexical scanning operations on a String.
+# This implementation covers the byte slicing problem in the standard
+# library's implementation.
+#
+class Greeb::StringScanner < StringScanner
+  # Returns the character position of the scan pointer. In the `reset`
+  # position, this value is zero. In the `terminated` position
+  # (i.e. the string is exhausted), this value is the length
+  # of the string.
+  #
+  # @return [Fixnum] the character position of the scan pointer.
+  #
+  def char_pos
+    string.byteslice(0...pos).length
+  end
+end
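`StringScanner#pos` counts bytes, so on multibyte input it cannot be used directly as a character offset; `char_pos` converts it by slicing off the already-scanned bytes and measuring their character length. A minimal sketch of the difference (assumes Ruby 1.9.3+ for `String#byteslice`; the sample string is illustrative):

```ruby
# encoding: utf-8
require 'greeb'

scanner = Greeb::StringScanner.new('Привет, мир!')
scanner.scan(/[[:alpha:]]+/)  # consumes "Привет": 6 characters, 12 bytes in UTF-8

scanner.pos       #=> 12, the byte offset kept by the standard StringScanner
scanner.char_pos  #=> 6, the character offset computed via byteslice
```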
data/lib/greeb/tokenizer.rb
CHANGED
@@ -1,6 +1,5 @@
 # encoding: utf-8
 
-require 'strscan'
 require 'set'
 
 # Greeb's tokenization facilities. Use 'em with love.
@@ -61,7 +60,7 @@ class Greeb::Tokenizer
   # @return [nil] nothing unless exception is raised.
   #
   def tokenize!
-    @scanner = StringScanner.new(text)
+    @scanner = Greeb::StringScanner.new(text)
     @tokens = SortedSet.new
     while !scanner.eos?
       parse! LETTERS, :letter or
@@ -82,13 +81,16 @@ class Greeb::Tokenizer
   #
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
-  #
+  #   type.
   #
   # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
   #
   def parse! pattern, type
     return false unless token = scanner.scan(pattern)
-
+    position = scanner.char_pos
+    @tokens << Greeb::Entity.new(position - token.length,
+                                 position,
+                                 type)
   end
 
   # Try to parse one small piece of text that is covered by pattern
@@ -97,13 +99,13 @@ class Greeb::Tokenizer
   #
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
-  #
+  #   type.
   #
   # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
   #
   def split_parse! pattern, type
     return false unless token = scanner.scan(pattern)
-    position = scanner.
+    position = scanner.char_pos - token.length
     token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
       @tokens << Greeb::Entity.new(before, before + s.length, type)
       before + s.length
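With the scanner swapped for `Greeb::StringScanner`, `parse!` and `split_parse!` now record character-based boundaries, so an entity's `from`/`to` can be used to slice the original string even for non-ASCII text. A minimal sketch (the sample text is illustrative; the token types follow the spec below):

```ruby
# encoding: utf-8
require 'greeb'

text = 'Привет, мир!'
tokens = Greeb::Tokenizer.new(text).tokens

first = tokens.first
first.type                   #=> :letter
text[first.from...first.to]  #=> "Привет", because from/to are character offsets
```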
data/lib/greeb/version.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -86,6 +86,22 @@ module Greeb
                          Entity.new(4, 7, :integer)])
       )
     end
+
+    it 'can deal with Russian language' do
+      Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
+        SortedSet.new([Entity.new(0, 8, :letter),
+                       Entity.new(8, 9, :spunct),
+                       Entity.new(9, 10, :separ),
+                       Entity.new(10, 11, :letter),
+                       Entity.new(11, 12, :separ),
+                       Entity.new(12, 16, :letter),
+                       Entity.new(16, 17, :separ),
+                       Entity.new(17, 25, :letter),
+                       Entity.new(25, 26, :separ),
+                       Entity.new(26, 32, :letter),
+                       Entity.new(32, 33, :punct)])
+      )
+    end
   end
 end
 end
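The expected offsets in the new Russian-language test are character positions, which is exactly what the `char_pos` change provides: slicing the original string with an entity's `from`/`to` yields the corresponding token. A small illustration using the test's own data:

```ruby
# encoding: utf-8
text = 'Братишка, я тебе покушать принёс!'

text[0...8]    #=> "Братишка"  (the first :letter entity)
text[8...9]    #=> ","         (the :spunct entity)
text[26...32]  #=> "принёс"    (the last :letter entity)
text[32...33]  #=> "!"         (the closing :punct entity)
```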
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.1.0.rc1
+  version: 0.1.0.rc3
   prerelease: 6
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-
+date: 2012-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -92,6 +92,7 @@ files:
 - greeb.gemspec
 - lib/greeb.rb
 - lib/greeb/segmentator.rb
+- lib/greeb/strscan.rb
 - lib/greeb/tokenizer.rb
 - lib/greeb/version.rb
 - spec/segmentator_spec.rb
@@ -111,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -
+      hash: -2527935574265859361
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: