greeb 0.1.0.rc1 → 0.1.0.rc3

data/.gitignore CHANGED
@@ -25,10 +25,17 @@ nbproject
  ## RVM
  .rvmrc

+ ## RUBINIUS
+ .rbx
+
  ## BUNDLER
  .bundle
  Gemfile.lock

+ ## YARD
+ .yardoc
+ doc
+
  ## PROJECT::GENERAL
  coverage
  pkg
data/.travis.yml CHANGED
@@ -4,4 +4,3 @@ branches:
  - master
  rvm:
  - 1.9.3
- - rbx-19mode
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2010-2012 Dmitry A. Ustalov
+ Copyright (c) 2010-2012 Dmitry Ustalov

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -79,7 +79,8 @@ such as sentence detection tasks:

  ```ruby
  text = 'Hello! How are you?'
- pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
+ tokenizer = Greeb::Tokenizer.new(text)
+ pp Greeb::Segmentator.new(tokenizer).sentences
  =begin
  #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
  #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
@@ -91,8 +92,8 @@ segmentator:

  ```ruby
  text = 'Hello! How are you?'
- segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
- sentences = segmentator.sentences
+ tokenizer = Greeb::Tokenizer.new(text)
+ sentences = Greeb::Segmentator.new(tokenizer).sentences
  pp segmentator.extract(*sentences)
  =begin
  {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
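For reference, here is a self-contained sketch of the same extraction flow; it keeps an explicit `segmentator` variable so the `segmentator.extract(*sentences)` call on the context line above still resolves (the variable names are illustrative, not part of the gem):

```ruby
require 'greeb'

text = 'Hello! How are you?'
tokenizer = Greeb::Tokenizer.new(text)
segmentator = Greeb::Segmentator.new(tokenizer)

sentences = segmentator.sentences
# Maps each sentence entity to the array of tokens it covers.
pp segmentator.extract(*sentences)
```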
@@ -135,6 +136,6 @@ systematic and awesome.

  ## Copyright

- Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
+ Copyright (c) 2010-2012 [Dmitry Ustalov]. See LICENSE for details.

- [Dmitry A. Ustalov]: http://eveel.ru
+ [Dmitry Ustalov]: http://eveel.ru
data/lib/greeb.rb CHANGED
@@ -12,6 +12,7 @@ require 'greeb/version'
  # `:break` for line endings.
  #
  class Greeb::Entity < Struct.new(:from, :to, :type)
+   # @private
    def <=> other
      if (comparison = self.from <=> other.from) == 0
        self.to <=> other.to
@@ -21,5 +22,6 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
    end
  end

+ require 'greeb/strscan'
  require 'greeb/tokenizer'
  require 'greeb/segmentator'
data/lib/greeb/segmentator.rb CHANGED
@@ -14,7 +14,7 @@ class Greeb::Segmentator
    # Create a new instance of {Greeb::Segmentator}.
    #
    # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
-   #   Greeb::Tokenizer or set of its results.
+   #   {Greeb::Tokenizer} or set of its results.
    #
    def initialize tokenizer_or_tokens
      @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
@@ -38,7 +38,7 @@ class Greeb::Segmentator
    # @param sentences [Array<Greeb::Entity>] a list of sentences.
    #
    # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-   #   sentences as keys and tokens arrays as values.
+   #   sentences as keys and tokens arrays as values.
    #
    def extract *sentences
      Hash[
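Per the @param documentation above, the constructor accepts either a tokenizer instance or its token set. A minimal sketch under that assumption (variable names are illustrative):

```ruby
require 'greeb'

tokenizer = Greeb::Tokenizer.new('Hello! How are you?')

# Both documented argument forms should be accepted:
from_tokenizer = Greeb::Segmentator.new(tokenizer)
from_tokens    = Greeb::Segmentator.new(tokenizer.tokens)

# Either way, sentence detection should yield the same SortedSet.
from_tokenizer.sentences == from_tokens.sentences
```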
data/lib/greeb/strscan.rb ADDED
@@ -0,0 +1,20 @@
+ # encoding: utf-8
+
+ require 'strscan'
+
+ # {StringScanner} provides for lexical scanning operations on a String.
+ # This implementation covers the byte slicing problem in the standard
+ # library's implementation.
+ #
+ class Greeb::StringScanner < StringScanner
+   # Returns the character position of the scan pointer. In the `reset`
+   # position, this value is zero. In the `terminated` position
+   # (i.e. the string is exhausted), this value is the length
+   # of the string.
+   #
+   # @return [Fixnum] the character position of the scan pointer.
+   #
+   def char_pos
+     string.byteslice(0...pos).length
+   end
+ end
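To illustrate why `char_pos` exists: `StringScanner#pos` counts bytes, so on multibyte UTF-8 input it overshoots the character index, while `char_pos` converts the byte offset back to characters. A minimal sketch (the Cyrillic sample string is an assumption, not taken from the gem):

```ruby
# encoding: utf-8
require 'greeb'  # loads greeb/strscan as of this release

scanner = Greeb::StringScanner.new('Привет, мир!')
scanner.scan(/\p{L}+/)  # consumes "Привет": 6 characters, 12 bytes in UTF-8

scanner.pos       # => 12, the standard library's byte offset
scanner.char_pos  # => 6, the character offset via byteslice
```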
data/lib/greeb/tokenizer.rb CHANGED
@@ -1,6 +1,5 @@
  # encoding: utf-8

- require 'strscan'
  require 'set'

  # Greeb's tokenization facilities. Use 'em with love.
@@ -61,7 +60,7 @@ class Greeb::Tokenizer
    # @return [nil] nothing unless exception is raised.
    #
    def tokenize!
-     @scanner = StringScanner.new(text)
+     @scanner = Greeb::StringScanner.new(text)
      @tokens = SortedSet.new
      while !scanner.eos?
        parse! LETTERS, :letter or
@@ -82,13 +81,16 @@ class Greeb::Tokenizer
    #
    # @param pattern [Regexp] a regular expression to extract the token.
    # @param type [Symbol] a symbol that represents the necessary token
-   #   type.
+   #   type.
    #
    # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
    #
    def parse! pattern, type
      return false unless token = scanner.scan(pattern)
-     @tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
+     position = scanner.char_pos
+     @tokens << Greeb::Entity.new(position - token.length,
+                                  position,
+                                  type)
    end

    # Try to parse one small piece of text that is covered by pattern
@@ -97,13 +99,13 @@ class Greeb::Tokenizer
    #
    # @param pattern [Regexp] a regular expression to extract the token.
    # @param type [Symbol] a symbol that represents the necessary token
-   #   type.
+   #   type.
    #
    # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
    #
    def split_parse! pattern, type
      return false unless token = scanner.scan(pattern)
-     position = scanner.pos - token.length
+     position = scanner.char_pos - token.length
      token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
        @tokens << Greeb::Entity.new(before, before + s.length, type)
        before + s.length
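With both parsers now reading positions through `char_pos`, entity boundaries are character offsets, so slicing the input with `from...to` recovers each token even for non-ASCII text. A small sketch; the sample string and the expected types are modeled on the Russian-language spec further down, not on additional guarantees:

```ruby
# encoding: utf-8
require 'greeb'

text = 'Привет, мир!'
Greeb::Tokenizer.new(text).tokens.map { |t| [text[t.from...t.to], t.type] }
# Expected, by analogy with the spec below:
# [["Привет", :letter], [",", :spunct], [" ", :separ],
#  ["мир", :letter], ["!", :punct]]
```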
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
  module Greeb
    # Version of Greeb.
    #
-   VERSION = '0.1.0.rc1'
+   VERSION = '0.1.0.rc3'
  end
data/spec/tokenizer_spec.rb CHANGED
@@ -86,6 +86,22 @@ module Greeb
                         Entity.new(4, 7, :integer)])
        )
      end
+
+     it 'can deal with Russian language' do
+       Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
+         SortedSet.new([Entity.new(0, 8, :letter),
+                        Entity.new(8, 9, :spunct),
+                        Entity.new(9, 10, :separ),
+                        Entity.new(10, 11, :letter),
+                        Entity.new(11, 12, :separ),
+                        Entity.new(12, 16, :letter),
+                        Entity.new(16, 17, :separ),
+                        Entity.new(17, 25, :letter),
+                        Entity.new(25, 26, :separ),
+                        Entity.new(26, 32, :letter),
+                        Entity.new(32, 33, :punct)])
+       )
+     end
    end
  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: greeb
  version: !ruby/object:Gem::Version
- version: 0.1.0.rc1
+ version: 0.1.0.rc3
  prerelease: 6
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-07-08 00:00:00.000000000 Z
+ date: 2012-07-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rake
@@ -92,6 +92,7 @@ files:
  - greeb.gemspec
  - lib/greeb.rb
  - lib/greeb/segmentator.rb
+ - lib/greeb/strscan.rb
  - lib/greeb/tokenizer.rb
  - lib/greeb/version.rb
  - spec/segmentator_spec.rb
@@ -111,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: -4603914053803130942
+ hash: -2527935574265859361
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements: