greeb 0.1.0.rc1 → 0.1.0.rc3
- data/.gitignore +7 -0
- data/.travis.yml +0 -1
- data/LICENSE +1 -1
- data/README.md +6 -5
- data/lib/greeb.rb +2 -0
- data/lib/greeb/segmentator.rb +2 -2
- data/lib/greeb/strscan.rb +20 -0
- data/lib/greeb/tokenizer.rb +8 -6
- data/lib/greeb/version.rb +1 -1
- data/spec/tokenizer_spec.rb +16 -0
- metadata +4 -3
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -79,7 +79,8 @@ such as sentence detection tasks:
 
 ```ruby
 text = 'Hello! How are you?'
-
+tokenizer = Greeb::Tokenizer.new(text)
+pp Greeb::Segmentator.new(tokenizer).sentences
 =begin
 #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
  #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
@@ -91,8 +92,8 @@ segmentator:
 
 ```ruby
 text = 'Hello! How are you?'
-
-sentences =
+tokenizer = Greeb::Tokenizer.new(text)
+sentences = Greeb::Segmentator.new(tokenizer).sentences
 pp segmentator.extract(*sentences)
 =begin
 {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
@@ -135,6 +136,6 @@ systematic and awesome.
 
 ## Copyright
 
-Copyright (c) 2010-2012 [Dmitry
+Copyright (c) 2010-2012 [Dmitry Ustalov]. See LICENSE for details.
 
-[Dmitry
+[Dmitry Ustalov]: http://eveel.ru
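Taken together, the two README snippets above read as one script; note that the second snippet calls `pp segmentator.extract(*sentences)` without assigning `segmentator`, so the sketch below (assuming greeb 0.1.0.rc3 is installed) introduces that variable explicitly. This is an illustrative combination of the README examples, not text from the gem:

```ruby
# Sketch combining the updated README examples above (greeb 0.1.0.rc3 assumed).
require 'pp'
require 'greeb'

text = 'Hello! How are you?'
tokenizer   = Greeb::Tokenizer.new(text)
segmentator = Greeb::Segmentator.new(tokenizer)

# Sentence boundaries as character offsets:
pp segmentator.sentences
# #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
#  #<struct Greeb::Entity from=7, to=19, type=:sentence>}>

# Tokens grouped per sentence:
pp segmentator.extract(*segmentator.sentences)
```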
data/lib/greeb.rb
CHANGED
@@ -12,6 +12,7 @@ require 'greeb/version'
 # `:break` for line endings.
 #
 class Greeb::Entity < Struct.new(:from, :to, :type)
+  # @private
   def <=> other
     if (comparison = self.from <=> other.from) == 0
       self.to <=> other.to
@@ -21,5 +22,6 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
   end
 end
 
+require 'greeb/strscan'
 require 'greeb/tokenizer'
 require 'greeb/segmentator'
data/lib/greeb/segmentator.rb
CHANGED
@@ -14,7 +14,7 @@ class Greeb::Segmentator
   # Create a new instance of {Greeb::Segmentator}.
   #
   # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
-  #
+  #   {Greeb::Tokenizer} or set of its results.
   #
   def initialize tokenizer_or_tokens
     @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
@@ -38,7 +38,7 @@ class Greeb::Segmentator
   # @param sentences [Array<Greeb::Entity>] a list of sentences.
   #
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-  #
+  #   sentences as keys and tokens arrays as values.
   #
   def extract *sentences
     Hash[
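The corrected `@param` above documents that the constructor accepts either a `Greeb::Tokenizer` or the token set itself. A hedged sketch of both call styles (assuming the tokenizer exposes its results via `tokens`, as the spec further down does):

```ruby
require 'greeb'

tokenizer = Greeb::Tokenizer.new('Hello! How are you?')

# Either argument form should be accepted per the @param docs:
from_tokenizer = Greeb::Segmentator.new(tokenizer)
from_tokens    = Greeb::Segmentator.new(tokenizer.tokens)

from_tokenizer.sentences == from_tokens.sentences  # => true (same underlying tokens)
```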
data/lib/greeb/strscan.rb
ADDED
@@ -0,0 +1,20 @@
+# encoding: utf-8
+
+require 'strscan'
+
+# {StringScanner} provides for lexical scanning operations on a String.
+# This implementation covers the byte slicing problem in the standard
+# library's implementation.
+#
+class Greeb::StringScanner < StringScanner
+  # Returns the character position of the scan pointer. In the `reset`
+  # position, this value is zero. In the `terminated` position
+  # (i.e. the string is exhausted), this value is the length
+  # of the string.
+  #
+  # @return [Fixnum] the character position of the scan pointer.
+  #
+  def char_pos
+    string.byteslice(0...pos).length
+  end
+end
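`char_pos` exists because `StringScanner#pos` counts bytes, which drifts away from character indices on multi-byte UTF-8 input. A small sketch, not part of the gem, of the difference:

```ruby
# encoding: utf-8
require 'greeb'  # assumes 0.1.0.rc3, which ships Greeb::StringScanner

scanner = Greeb::StringScanner.new('Привет, мир!')
scanner.scan(/[[:alpha:]]+/)  # consume the Cyrillic word "Привет"

scanner.pos       # => 12 -- byte offset: six 2-byte UTF-8 characters
scanner.char_pos  # => 6  -- character offset, via string.byteslice(0...pos).length
```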
data/lib/greeb/tokenizer.rb
CHANGED
@@ -1,6 +1,5 @@
 # encoding: utf-8
 
-require 'strscan'
 require 'set'
 
 # Greeb's tokenization facilities. Use 'em with love.
@@ -61,7 +60,7 @@ class Greeb::Tokenizer
   # @return [nil] nothing unless exception is raised.
   #
   def tokenize!
-    @scanner = StringScanner.new(text)
+    @scanner = Greeb::StringScanner.new(text)
     @tokens = SortedSet.new
     while !scanner.eos?
       parse! LETTERS, :letter or
@@ -82,13 +81,16 @@ class Greeb::Tokenizer
   #
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
-  #
+  #   type.
   #
   # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
   #
   def parse! pattern, type
     return false unless token = scanner.scan(pattern)
-
+    position = scanner.char_pos
+    @tokens << Greeb::Entity.new(position - token.length,
+                                 position,
+                                 type)
   end
 
   # Try to parse one small piece of text that is covered by pattern
@@ -97,13 +99,13 @@ class Greeb::Tokenizer
   #
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
-  #
+  #   type.
   #
   # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
   #
   def split_parse! pattern, type
     return false unless token = scanner.scan(pattern)
-    position = scanner.
+    position = scanner.char_pos - token.length
     token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
       @tokens << Greeb::Entity.new(before, before + s.length, type)
       before + s.length
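Because `parse!` and `split_parse!` now derive boundaries from `char_pos`, entity offsets count characters rather than bytes. A brief sketch using the same sentence as the new spec below:

```ruby
# encoding: utf-8
require 'greeb'

tokens = Greeb::Tokenizer.new('Братишка, я тебе покушать принёс!').tokens
tokens.first
# => #<struct Greeb::Entity from=0, to=8, type=:letter>   ("Братишка", 8 characters)
```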
data/lib/greeb/version.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -86,6 +86,22 @@ module Greeb
                          Entity.new(4, 7, :integer)])
         )
       end
+
+      it 'can deal with Russian language' do
+        Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
+          SortedSet.new([Entity.new(0, 8, :letter),
+                         Entity.new(8, 9, :spunct),
+                         Entity.new(9, 10, :separ),
+                         Entity.new(10, 11, :letter),
+                         Entity.new(11, 12, :separ),
+                         Entity.new(12, 16, :letter),
+                         Entity.new(16, 17, :separ),
+                         Entity.new(17, 25, :letter),
+                         Entity.new(25, 26, :separ),
+                         Entity.new(26, 32, :letter),
+                         Entity.new(32, 33, :punct)])
+        )
+      end
     end
   end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.1.0.rc1
+  version: 0.1.0.rc3
 prerelease: 6
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-
+date: 2012-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -92,6 +92,7 @@ files:
 - greeb.gemspec
 - lib/greeb.rb
 - lib/greeb/segmentator.rb
+- lib/greeb/strscan.rb
 - lib/greeb/tokenizer.rb
 - lib/greeb/version.rb
 - spec/segmentator_spec.rb
@@ -111,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -
+      hash: -2527935574265859361
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: