greeb 0.1.0.rc6 → 0.1.0.rc7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/greeb/tokenizer.rb +10 -2
- data/lib/greeb/version.rb +1 -1
- metadata +3 -3
data/lib/greeb/tokenizer.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
# Greeb's tokenization facilities. Use 'em with love.
|
3
|
+
# Greeb's tokenization facilities. Use 'em with love.
|
4
|
+
#
|
5
|
+
# Unicode character categories have been obtained from
|
6
|
+
# <http://www.fileformat.info/info/unicode/category/index.htm>.
|
4
7
|
#
|
5
8
|
class Greeb::Tokenizer
|
6
9
|
# This runtime error appears when {Greeb::Tokenizer} tries to recognize
|
@@ -43,12 +46,16 @@ class Greeb::Tokenizer
|
|
43
46
|
|
44
47
|
# In-subsentence separator (e.g. "*" or "=").
|
45
48
|
#
|
46
|
-
SEPARATORS = /[ \p{
|
49
|
+
SEPARATORS = /[ \p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Z}]+/u
|
47
50
|
|
48
51
|
# Line breaks.
|
49
52
|
#
|
50
53
|
BREAKS = /(\r\n|\n|\r)+/u
|
51
54
|
|
55
|
+
# Residuals.
|
56
|
+
#
|
57
|
+
RESIDUALS = /[\p{C}\p{M}\p{Sk}]+/u
|
58
|
+
|
52
59
|
attr_reader :text, :scanner
|
53
60
|
protected :scanner
|
54
61
|
|
@@ -86,6 +93,7 @@ class Greeb::Tokenizer
|
|
86
93
|
split_parse! PUNCTUATIONS, :punct or
|
87
94
|
split_parse! SEPARATORS, :separ or
|
88
95
|
split_parse! BREAKS, :break or
|
96
|
+
parse! RESIDUALS, :residual or
|
89
97
|
raise UnknownEntity.new(text, scanner.char_pos)
|
90
98
|
end
|
91
99
|
ensure
|
data/lib/greeb/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.rc6
|
4
|
+
version: 0.1.0.rc7
|
5
5
|
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -109,7 +109,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
109
109
|
requirements:
|
110
110
|
- - ! '>='
|
111
111
|
- !ruby/object:Gem::Version
|
112
|
-
hash:
|
112
|
+
hash: 2716089438708653231
|
113
113
|
version: '0'
|
114
114
|
segments:
|
115
115
|
- 0
|