greeb 0.2.0.rc1 → 0.2.0.rc2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +14 -0
- data/greeb.gemspec +1 -1
- data/lib/greeb/parser.rb +13 -0
- data/lib/greeb/tokenizer.rb +14 -14
- data/lib/greeb/version.rb +1 -1
- data/spec/parser_spec.rb +16 -5
- data/spec/support/invoker.rb +1 -1
- data/spec/tokenizer_spec.rb +15 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78e6a2b607a8690b2fe171665e35272efd31b2ac
|
4
|
+
data.tar.gz: 4517bb06cc8e1f8b0be5fc47bca4bfeda0fcfd49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd24765af042566249e0c8d3153aee640c626b8f838fdda03c7b6598ad30bd0935df47ad1d1c6658ccf0e4fd598a94052c99dee127becb2f9c59b3e8dafe1cf0
|
7
|
+
data.tar.gz: 9eca1a25e8837732827a282d3d532db35113620372a0bde41b64669cb3fd696cc4c149267e19a5cfe58df7367b2932942554ea88cfd26826c73db926ad6ca89d
|
data/README.md
CHANGED
@@ -129,6 +129,7 @@ pp segmentator.extract(segmentator.sentences)
|
|
129
129
|
Texts often include some special entities such as URLs and e-mail
|
130
130
|
addresses. Greeb can help you retrieve these strings.
|
131
131
|
|
132
|
+
#### URL and E-mail retrieval
|
132
133
|
```ruby
|
133
134
|
text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
134
135
|
|
@@ -145,6 +146,19 @@ pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
|
|
145
146
|
|
146
147
|
Please don't use Greeb for spam list development purposes.
|
147
148
|
|
149
|
+
#### Abbreviation retrieval
|
150
|
+
```ruby
|
151
|
+
text = 'Hello, G.L.H.F. everyone!'
|
152
|
+
|
153
|
+
pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
|
154
|
+
=begin
|
155
|
+
[[#<struct Greeb::Entity from=7, to=15, type=:abbrev>, "G.L.H.F."]]
|
156
|
+
=end
|
157
|
+
```
|
158
|
+
|
159
|
+
The algorithm is not very accurate, but it is still useful in many practical
|
160
|
+
situations.
|
161
|
+
|
148
162
|
## Tokens
|
149
163
|
Greeb operates with entities, tuples of *(from, to, kind)*, where
|
150
164
|
*from* is the beginning of the entity, *to* is the end of the entity,
|
data/greeb.gemspec
CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
|
|
16
16
|
s.rubyforge_project = 'greeb'
|
17
17
|
|
18
18
|
s.add_development_dependency 'rake'
|
19
|
-
s.add_development_dependency 'minitest', '
|
19
|
+
s.add_development_dependency 'minitest', '~> 5.0'
|
20
20
|
|
21
21
|
s.files = `git ls-files`.split("\n")
|
22
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/lib/greeb/parser.rb
CHANGED
@@ -13,6 +13,9 @@ module Greeb::Parser
|
|
13
13
|
# A horrible e-mail pattern.
|
14
14
|
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
|
15
15
|
|
16
|
+
# Another horrible pattern. Now for abbreviations.
|
17
|
+
ABBREV = /\b(\p{L}\.)+/i
|
18
|
+
|
16
19
|
# Recognize URLs in the input text. Actually, URL is an obsolete standard
|
17
20
|
# and this code should be rewritten to use the URI concept.
|
18
21
|
#
|
@@ -34,6 +37,16 @@ module Greeb::Parser
|
|
34
37
|
scan(text, EMAIL, :email)
|
35
38
|
end
|
36
39
|
|
40
|
+
# Recognize abbreviations in the input text.
|
41
|
+
#
|
42
|
+
# @param text [String] input text.
|
43
|
+
#
|
44
|
+
# @return [Array<Greeb::Entity>] found abbreviations.
|
45
|
+
#
|
46
|
+
def abbrevs(text)
|
47
|
+
scan(text, ABBREV, :abbrev)
|
48
|
+
end
|
49
|
+
|
37
50
|
private
|
38
51
|
# Implementation of regexp-based {Greeb::Entity} scanner.
|
39
52
|
#
|
data/lib/greeb/tokenizer.rb
CHANGED
@@ -57,6 +57,20 @@ module Greeb::Tokenizer
|
|
57
57
|
scanner.terminate
|
58
58
|
end
|
59
59
|
|
60
|
+
# Split one line into characters array, but also combine duplicated
|
61
|
+
# characters.
|
62
|
+
#
|
63
|
+
# For instance, `"a b\n\n\nc"` would be transformed into the following
|
64
|
+
# array: `["a", " ", "b", "\n\n\n", "c"]`.
|
65
|
+
#
|
66
|
+
# @param token [String] a token to be split.
|
67
|
+
#
|
68
|
+
# @return [Array<String>] split characters.
|
69
|
+
#
|
70
|
+
def split(token)
|
71
|
+
token.scan(/((.|\n)\2*)/).map(&:first)
|
72
|
+
end
|
73
|
+
|
60
74
|
protected
|
61
75
|
# One iteration of the tokenization process.
|
62
76
|
#
|
@@ -115,18 +129,4 @@ module Greeb::Tokenizer
|
|
115
129
|
before + s.length
|
116
130
|
end
|
117
131
|
end
|
118
|
-
|
119
|
-
# Split one line into characters array, but also combine line breaks
|
120
|
-
# into single elements.
|
121
|
-
#
|
122
|
-
# For instance, `"a b\n\n\nc"` would be transformed into the following
|
123
|
-
# array: `["a", " ", "b", "\n\n\n", "c"]`.
|
124
|
-
#
|
125
|
-
# @param token [String] a token to be splitted.
|
126
|
-
#
|
127
|
-
# @return [Array<String>] splitted characters.
|
128
|
-
#
|
129
|
-
def split(token)
|
130
|
-
token.scan(/((.|\n)\2*)/).map(&:first)
|
131
|
-
end
|
132
132
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/parser_spec.rb
CHANGED
@@ -5,9 +5,9 @@ require_relative 'spec_helper'
|
|
5
5
|
module Greeb
|
6
6
|
describe Parser do
|
7
7
|
let(:text) do
|
8
|
-
'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
|
8
|
+
'Hello there! My name is Vasya B. My website is: http://вася.рф/. ' \
|
9
9
|
'And my e-mail is example@example.com! Also it is available by ' \
|
10
|
-
'URL: http://vasya.ru.'
|
10
|
+
'URL: http://vasya.ru. Also, G.L.H.F. everyone!'
|
11
11
|
end
|
12
12
|
|
13
13
|
describe 'URL' do
|
@@ -15,8 +15,8 @@ module Greeb
|
|
15
15
|
|
16
16
|
it 'recognizes URLs' do
|
17
17
|
subject.must_equal(
|
18
|
-
[Entity.new(
|
19
|
-
Entity.new(
|
18
|
+
[Entity.new(48, 63, :url),
|
19
|
+
Entity.new(132, 147, :url)]
|
20
20
|
)
|
21
21
|
end
|
22
22
|
end
|
@@ -26,7 +26,18 @@ module Greeb
|
|
26
26
|
|
27
27
|
it 'recognizes e-mails' do
|
28
28
|
subject.must_equal(
|
29
|
-
[Entity.new(
|
29
|
+
[Entity.new(82, 101, :email)]
|
30
|
+
)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe 'ABBREV' do
|
35
|
+
subject { Parser.abbrevs(text) }
|
36
|
+
|
37
|
+
it 'recognizes abbreviations' do
|
38
|
+
subject.must_equal(
|
39
|
+
[Entity.new(30, 32, :abbrev),
|
40
|
+
Entity.new(155, 163, :abbrev)]
|
30
41
|
)
|
31
42
|
end
|
32
43
|
end
|
data/spec/support/invoker.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -79,5 +79,20 @@ module Greeb
|
|
79
79
|
)
|
80
80
|
end
|
81
81
|
end
|
82
|
+
|
83
|
+
describe '.split' do
|
84
|
+
it 'should split characters' do
|
85
|
+
Tokenizer.split('loh').must_equal %w(l o h)
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should combine duplicated characters' do
|
89
|
+
Tokenizer.split('foo').must_equal %w(f oo)
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'should also deal with line breaks' do
|
93
|
+
Tokenizer.split("bar\n\nbaz").must_equal(
|
94
|
+
[*%w(b a r), "\n\n", *%w(b a z)])
|
95
|
+
end
|
96
|
+
end
|
82
97
|
end
|
83
98
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.
|
4
|
+
version: 0.2.0.rc2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-05-
|
11
|
+
date: 2013-05-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -28,16 +28,16 @@ dependencies:
|
|
28
28
|
name: minitest
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '5.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '5.0'
|
41
41
|
description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
|
42
42
|
written in Ruby.
|
43
43
|
email:
|