greeb 0.1.0.rc1 → 0.1.0.rc3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/.travis.yml +0 -1
- data/LICENSE +1 -1
- data/README.md +6 -5
- data/lib/greeb.rb +2 -0
- data/lib/greeb/segmentator.rb +2 -2
- data/lib/greeb/strscan.rb +20 -0
- data/lib/greeb/tokenizer.rb +8 -6
- data/lib/greeb/version.rb +1 -1
- data/spec/tokenizer_spec.rb +16 -0
- metadata +4 -3
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -79,7 +79,8 @@ such as sentence detection tasks:
 
 ```ruby
 text = 'Hello! How are you?'
-
+tokenizer = Greeb::Tokenizer.new(text)
+pp Greeb::Segmentator.new(tokenizer).sentences
 =begin
 #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
 #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
@@ -91,8 +92,8 @@ segmentator:
 
 ```ruby
 text = 'Hello! How are you?'
-
-sentences =
+tokenizer = Greeb::Tokenizer.new(text)
+sentences = Greeb::Segmentator.new(tokenizer).sentences
 pp segmentator.extract(*sentences)
 =begin
 {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
@@ -135,6 +136,6 @@ systematic and awesome.
 
 ## Copyright
 
-Copyright (c) 2010-2012 [Dmitry
+Copyright (c) 2010-2012 [Dmitry Ustalov]. See LICENSE for details.
 
-[Dmitry
+[Dmitry Ustalov]: http://eveel.ru
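Worth noting about the second README hunk above: the new lines build `sentences` from a throwaway `Greeb::Segmentator` instance, while the unchanged context line still calls `segmentator.extract(*sentences)` on a variable the snippet never assigns. A minimal sketch of what the example presumably intends, keeping the segmentator in a variable (not part of the diff, just an illustration):

```ruby
require 'pp'
require 'greeb'

text = 'Hello! How are you?'
tokenizer = Greeb::Tokenizer.new(text)

# Keep the segmentator around so it can both detect and extract sentences.
segmentator = Greeb::Segmentator.new(tokenizer)
sentences = segmentator.sentences
pp segmentator.extract(*sentences)
```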
data/lib/greeb.rb
CHANGED
@@ -12,6 +12,7 @@ require 'greeb/version'
 # `:break` for line endings.
 #
 class Greeb::Entity < Struct.new(:from, :to, :type)
+  # @private
   def <=> other
     if (comparison = self.from <=> other.from) == 0
       self.to <=> other.to
@@ -21,5 +22,6 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
   end
 end
 
+require 'greeb/strscan'
 require 'greeb/tokenizer'
 require 'greeb/segmentator'
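The `# @private` tag only hides `Greeb::Entity#<=>` from the generated documentation; the comparison itself is what lets entities live in a `SortedSet` ordered by position. A minimal sketch of that ordering (the entity values are illustrative):

```ruby
require 'greeb'

a = Greeb::Entity.new(0, 6, :sentence)
b = Greeb::Entity.new(7, 19, :sentence)

# Entities compare by `from` first and then by `to`, so the set comes out
# ordered by position regardless of insertion order.
SortedSet.new([b, a]).to_a  #=> [a, b]
```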
data/lib/greeb/segmentator.rb
CHANGED
@@ -14,7 +14,7 @@ class Greeb::Segmentator
   # Create a new instance of {Greeb::Segmentator}.
   #
   # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
-  #
+  #   {Greeb::Tokenizer} or set of its results.
   #
   def initialize tokenizer_or_tokens
     @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
@@ -38,7 +38,7 @@ class Greeb::Segmentator
   # @param sentences [Array<Greeb::Entity>] a list of sentences.
   #
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-  #
+  #   sentences as keys and tokens arrays as values.
   #
   def extract *sentences
     Hash[
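Per the corrected `@param` documentation, the constructor accepts either a tokenizer or a ready-made set of its tokens. A small sketch of the two call styles (the sample text is illustrative; both instances should detect the same sentences):

```ruby
require 'greeb'

tokenizer = Greeb::Tokenizer.new('Hello! How are you?')

# Either hand over the tokenizer itself...
by_tokenizer = Greeb::Segmentator.new(tokenizer)

# ...or the set of tokens it has produced.
by_tokens = Greeb::Segmentator.new(tokenizer.tokens)

by_tokenizer.sentences  #=> the same sentence entities as by_tokens.sentences
```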
data/lib/greeb/strscan.rb
ADDED
@@ -0,0 +1,20 @@
+# encoding: utf-8
+
+require 'strscan'
+
+# {StringScanner} provides for lexical scanning operations on a String.
+# This implementation covers the byte slicing problem in the standard
+# library's implementation.
+#
+class Greeb::StringScanner < StringScanner
+  # Returns the character position of the scan pointer. In the `reset`
+  # position, this value is zero. In the `terminated` position
+  # (i.e. the string is exhausted), this value is the length
+  # of the string.
+  #
+  # @return [Fixnum] the character position of the scan pointer.
+  #
+  def char_pos
+    string.byteslice(0...pos).length
+  end
+end
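`StringScanner#pos` counts bytes, so on multibyte input it cannot be used directly as a character offset; `char_pos` converts it by slicing off the already-scanned bytes and measuring their character length. A minimal sketch of the difference (assumes Ruby 1.9.3+ for `String#byteslice`; the sample string is illustrative):

```ruby
# encoding: utf-8
require 'greeb'

scanner = Greeb::StringScanner.new('Привет, мир!')
scanner.scan(/[[:alpha:]]+/)  # consumes "Привет": 6 characters, 12 bytes in UTF-8

scanner.pos       #=> 12, the byte offset kept by the standard StringScanner
scanner.char_pos  #=> 6, the character offset computed via byteslice
```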
data/lib/greeb/tokenizer.rb
CHANGED
@@ -1,6 +1,5 @@
 # encoding: utf-8
 
-require 'strscan'
 require 'set'
 
 # Greeb's tokenization facilities. Use 'em with love.
@@ -61,7 +60,7 @@ class Greeb::Tokenizer
   # @return [nil] nothing unless exception is raised.
   #
   def tokenize!
-    @scanner = StringScanner.new(text)
+    @scanner = Greeb::StringScanner.new(text)
     @tokens = SortedSet.new
     while !scanner.eos?
       parse! LETTERS, :letter or
@@ -82,13 +81,16 @@ class Greeb::Tokenizer
   #
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
-  #
+  #   type.
   #
   # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
   #
   def parse! pattern, type
     return false unless token = scanner.scan(pattern)
-
+    position = scanner.char_pos
+    @tokens << Greeb::Entity.new(position - token.length,
+                                 position,
+                                 type)
   end
 
   # Try to parse one small piece of text that is covered by pattern
@@ -97,13 +99,13 @@ class Greeb::Tokenizer
   #
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
-  #
+  #   type.
   #
   # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
   #
   def split_parse! pattern, type
     return false unless token = scanner.scan(pattern)
-    position = scanner.
+    position = scanner.char_pos - token.length
     token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
       @tokens << Greeb::Entity.new(before, before + s.length, type)
       before + s.length
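With the scanner swapped for `Greeb::StringScanner`, `parse!` and `split_parse!` now record character-based boundaries, so an entity's `from`/`to` can be used to slice the original string even for non-ASCII text. A minimal sketch (the sample text is illustrative; the token types follow the spec below):

```ruby
# encoding: utf-8
require 'greeb'

text = 'Привет, мир!'
tokens = Greeb::Tokenizer.new(text).tokens

first = tokens.first
first.type                   #=> :letter
text[first.from...first.to]  #=> "Привет", because from/to are character offsets
```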
data/lib/greeb/version.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -86,6 +86,22 @@ module Greeb
                          Entity.new(4, 7, :integer)])
       )
     end
+
+    it 'can deal with Russian language' do
+      Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
+        SortedSet.new([Entity.new(0, 8, :letter),
+                       Entity.new(8, 9, :spunct),
+                       Entity.new(9, 10, :separ),
+                       Entity.new(10, 11, :letter),
+                       Entity.new(11, 12, :separ),
+                       Entity.new(12, 16, :letter),
+                       Entity.new(16, 17, :separ),
+                       Entity.new(17, 25, :letter),
+                       Entity.new(25, 26, :separ),
+                       Entity.new(26, 32, :letter),
+                       Entity.new(32, 33, :punct)])
+      )
+    end
   end
 end
 end
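The expected offsets in the new Russian-language test are character positions, which is exactly what the `char_pos` change provides: slicing the original string with an entity's `from`/`to` yields the corresponding token. A small illustration using the test's own data:

```ruby
# encoding: utf-8
text = 'Братишка, я тебе покушать принёс!'

text[0...8]    #=> "Братишка"  (the first :letter entity)
text[8...9]    #=> ","         (the :spunct entity)
text[26...32]  #=> "принёс"    (the last :letter entity)
text[32...33]  #=> "!"         (the closing :punct entity)
```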
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.1.0.rc1
+  version: 0.1.0.rc3
   prerelease: 6
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-
+date: 2012-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -92,6 +92,7 @@ files:
 - greeb.gemspec
 - lib/greeb.rb
 - lib/greeb/segmentator.rb
+- lib/greeb/strscan.rb
 - lib/greeb/tokenizer.rb
 - lib/greeb/version.rb
 - spec/segmentator_spec.rb
@@ -111,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -
+      hash: -2527935574265859361
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: