greeb 0.1.0.rc1 → 0.1.0.rc3

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
data/.gitignore CHANGED
@@ -25,10 +25,17 @@ nbproject
25
25
  ## RVM
26
26
  .rvmrc
27
27
 
28
+ ## RUBINIUS
29
+ .rbx
30
+
28
31
  ## BUNDLER
29
32
  .bundle
30
33
  Gemfile.lock
31
34
 
35
+ ## YARD
36
+ .yardoc
37
+ doc
38
+
32
39
  ## PROJECT::GENERAL
33
40
  coverage
34
41
  pkg
data/.travis.yml CHANGED
@@ -4,4 +4,3 @@ branches:
4
4
  - master
5
5
  rvm:
6
6
  - 1.9.3
7
- - rbx-19mode
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010-2012 Dmitry A. Ustalov
1
+ Copyright (c) 2010-2012 Dmitry Ustalov
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -79,7 +79,8 @@ such as sentence detection tasks:
79
79
 
80
80
  ```ruby
81
81
  text = 'Hello! How are you?'
82
- pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
82
+ tokenizer = Greeb::Tokenizer.new(text)
83
+ pp Greeb::Segmentator.new(tokenizer).sentences
83
84
  =begin
84
85
  #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
85
86
  #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
@@ -91,8 +92,8 @@ segmentator:
91
92
 
92
93
  ```ruby
93
94
  text = 'Hello! How are you?'
94
- segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
95
- sentences = segmentator.sentences
95
+ tokenizer = Greeb::Tokenizer.new(text)
96
+ sentences = Greeb::Segmentator.new(tokenizer).sentences
96
97
  pp segmentator.extract(*sentences)
97
98
  =begin
98
99
  {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
@@ -135,6 +136,6 @@ systematic and awesome.
135
136
 
136
137
  ## Copyright
137
138
 
138
- Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
139
+ Copyright (c) 2010-2012 [Dmitry Ustalov]. See LICENSE for details.
139
140
 
140
- [Dmitry A. Ustalov]: http://eveel.ru
141
+ [Dmitry Ustalov]: http://eveel.ru
data/lib/greeb.rb CHANGED
@@ -12,6 +12,7 @@ require 'greeb/version'
12
12
  # `:break` for line endings.
13
13
  #
14
14
  class Greeb::Entity < Struct.new(:from, :to, :type)
15
+ # @private
15
16
  def <=> other
16
17
  if (comparison = self.from <=> other.from) == 0
17
18
  self.to <=> other.to
@@ -21,5 +22,6 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
21
22
  end
22
23
  end
23
24
 
25
+ require 'greeb/strscan'
24
26
  require 'greeb/tokenizer'
25
27
  require 'greeb/segmentator'
data/lib/greeb/segmentator.rb CHANGED
@@ -14,7 +14,7 @@ class Greeb::Segmentator
14
14
  # Create a new instance of {Greeb::Segmentator}.
15
15
  #
16
16
  # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
17
- # Greeb::Tokenizer or set of its results.
17
+ # {Greeb::Tokenizer} or set of its results.
18
18
  #
19
19
  def initialize tokenizer_or_tokens
20
20
  @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
@@ -38,7 +38,7 @@ class Greeb::Segmentator
38
38
  # @param sentences [Array<Greeb::Entity>] a list of sentences.
39
39
  #
40
40
  # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
41
- # sentences as keys and tokens arrays as values.
41
+ # sentences as keys and tokens arrays as values.
42
42
  #
43
43
  def extract *sentences
44
44
  Hash[
data/lib/greeb/strscan.rb ADDED
@@ -0,0 +1,20 @@
1
+ # encoding: utf-8
2
+
3
+ require 'strscan'
4
+
5
+ # {StringScanner} provides for lexical scanning operations on a String.
6
+ # This implementation covers the byte slicing problem in the standard
7
+ # library's implementation.
8
+ #
9
+ class Greeb::StringScanner < StringScanner
10
+ # Returns the character position of the scan pointer. In the `reset`
11
+ # position, this value is zero. In the `terminated` position
12
+ # (i.e. the string is exhausted), this value is the length
13
+ # of the string.
14
+ #
15
+ # @return [Fixnum] the character position of the scan pointer.
16
+ #
17
+ def char_pos
18
+ string.byteslice(0...pos).length
19
+ end
20
+ end
data/lib/greeb/tokenizer.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'strscan'
4
3
  require 'set'
5
4
 
6
5
  # Greeb's tokenization facilities. Use 'em with love.
@@ -61,7 +60,7 @@ class Greeb::Tokenizer
61
60
  # @return [nil] nothing unless exception is raised.
62
61
  #
63
62
  def tokenize!
64
- @scanner = StringScanner.new(text)
63
+ @scanner = Greeb::StringScanner.new(text)
65
64
  @tokens = SortedSet.new
66
65
  while !scanner.eos?
67
66
  parse! LETTERS, :letter or
@@ -82,13 +81,16 @@ class Greeb::Tokenizer
82
81
  #
83
82
  # @param pattern [Regexp] a regular expression to extract the token.
84
83
  # @param type [Symbol] a symbol that represents the necessary token
85
- # type.
84
+ # type.
86
85
  #
87
86
  # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
88
87
  #
89
88
  def parse! pattern, type
90
89
  return false unless token = scanner.scan(pattern)
91
- @tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
90
+ position = scanner.char_pos
91
+ @tokens << Greeb::Entity.new(position - token.length,
92
+ position,
93
+ type)
92
94
  end
93
95
 
94
96
  # Try to parse one small piece of text that is covered by pattern
@@ -97,13 +99,13 @@ class Greeb::Tokenizer
97
99
  #
98
100
  # @param pattern [Regexp] a regular expression to extract the token.
99
101
  # @param type [Symbol] a symbol that represents the necessary token
100
- # type.
102
+ # type.
101
103
  #
102
104
  # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
103
105
  #
104
106
  def split_parse! pattern, type
105
107
  return false unless token = scanner.scan(pattern)
106
- position = scanner.pos - token.length
108
+ position = scanner.char_pos - token.length
107
109
  token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
108
110
  @tokens << Greeb::Entity.new(before, before + s.length, type)
109
111
  before + s.length
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.1.0.rc1'
8
+ VERSION = '0.1.0.rc3'
9
9
  end
@@ -86,6 +86,22 @@ module Greeb
86
86
  Entity.new(4, 7, :integer)])
87
87
  )
88
88
  end
89
+
90
+ it 'can deal with Russian language' do
91
+ Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
92
+ SortedSet.new([Entity.new(0, 8, :letter),
93
+ Entity.new(8, 9, :spunct),
94
+ Entity.new(9, 10, :separ),
95
+ Entity.new(10, 11, :letter),
96
+ Entity.new(11, 12, :separ),
97
+ Entity.new(12, 16, :letter),
98
+ Entity.new(16, 17, :separ),
99
+ Entity.new(17, 25, :letter),
100
+ Entity.new(25, 26, :separ),
101
+ Entity.new(26, 32, :letter),
102
+ Entity.new(32, 33, :punct)])
103
+ )
104
+ end
89
105
  end
90
106
  end
91
107
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.rc1
4
+ version: 0.1.0.rc3
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-08 00:00:00.000000000 Z
12
+ date: 2012-07-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -92,6 +92,7 @@ files:
92
92
  - greeb.gemspec
93
93
  - lib/greeb.rb
94
94
  - lib/greeb/segmentator.rb
95
+ - lib/greeb/strscan.rb
95
96
  - lib/greeb/tokenizer.rb
96
97
  - lib/greeb/version.rb
97
98
  - spec/segmentator_spec.rb
@@ -111,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
111
112
  version: '0'
112
113
  segments:
113
114
  - 0
114
- hash: -4603914053803130942
115
+ hash: -2527935574265859361
115
116
  required_rubygems_version: !ruby/object:Gem::Requirement
116
117
  none: false
117
118
  requirements: