greeb 0.2.0.rc1 → 0.2.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +14 -0
- data/greeb.gemspec +1 -1
- data/lib/greeb/parser.rb +13 -0
- data/lib/greeb/tokenizer.rb +14 -14
- data/lib/greeb/version.rb +1 -1
- data/spec/parser_spec.rb +16 -5
- data/spec/support/invoker.rb +1 -1
- data/spec/tokenizer_spec.rb +15 -0
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 78e6a2b607a8690b2fe171665e35272efd31b2ac
|
|
4
|
+
data.tar.gz: 4517bb06cc8e1f8b0be5fc47bca4bfeda0fcfd49
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dd24765af042566249e0c8d3153aee640c626b8f838fdda03c7b6598ad30bd0935df47ad1d1c6658ccf0e4fd598a94052c99dee127becb2f9c59b3e8dafe1cf0
|
|
7
|
+
data.tar.gz: 9eca1a25e8837732827a282d3d532db35113620372a0bde41b64669cb3fd696cc4c149267e19a5cfe58df7367b2932942554ea88cfd26826c73db926ad6ca89d
|
data/README.md
CHANGED
|
@@ -129,6 +129,7 @@ pp segmentator.extract(segmentator.sentences)
|
|
|
129
129
|
Texts often include some special entities such as URLs and e-mail
|
|
130
130
|
addresses. Greeb can help you retrieve these strings.
|
|
131
131
|
|
|
132
|
+
#### URL and E-mail retrieval
|
|
132
133
|
```ruby
|
|
133
134
|
text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
|
134
135
|
|
|
@@ -145,6 +146,19 @@ pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
|
|
|
145
146
|
|
|
146
147
|
Please don't use Greeb for spam list development purposes.
|
|
147
148
|
|
|
149
|
+
#### Abbreviation retrieval
|
|
150
|
+
```ruby
|
|
151
|
+
text = 'Hello, G.L.H.F. everyone!'
|
|
152
|
+
|
|
153
|
+
pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
|
|
154
|
+
=begin
|
|
155
|
+
[[#<struct Greeb::Entity from=7, to=15, type=:abbrev>, "G.L.H.F."]]
|
|
156
|
+
=end
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
The algorithm is not very accurate, but it is still useful in many practical
|
|
160
|
+
situations.
|
|
161
|
+
|
|
148
162
|
## Tokens
|
|
149
163
|
Greeb operates with entities, tuples of *(from, to, kind)*, where
|
|
150
164
|
*from* is the beginning of the entity, *to* is the end of the entity,
|
data/greeb.gemspec
CHANGED
|
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
|
|
|
16
16
|
s.rubyforge_project = 'greeb'
|
|
17
17
|
|
|
18
18
|
s.add_development_dependency 'rake'
|
|
19
|
-
s.add_development_dependency 'minitest', '
|
|
19
|
+
s.add_development_dependency 'minitest', '~> 5.0'
|
|
20
20
|
|
|
21
21
|
s.files = `git ls-files`.split("\n")
|
|
22
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/lib/greeb/parser.rb
CHANGED
|
@@ -13,6 +13,9 @@ module Greeb::Parser
|
|
|
13
13
|
# A horrible e-mail pattern.
|
|
14
14
|
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
|
|
15
15
|
|
|
16
|
+
# Another horrible pattern. Now for abbreviations.
|
|
17
|
+
ABBREV = /\b(\p{L}\.)+/i
|
|
18
|
+
|
|
16
19
|
# Recognize URLs in the input text. Actually, URL is an obsolete standard
|
|
17
20
|
# and this code should be rewritten to use the URI concept.
|
|
18
21
|
#
|
|
@@ -34,6 +37,16 @@ module Greeb::Parser
|
|
|
34
37
|
scan(text, EMAIL, :email)
|
|
35
38
|
end
|
|
36
39
|
|
|
40
|
+
# Recognize abbreviations in the input text.
|
|
41
|
+
#
|
|
42
|
+
# @param text [String] input text.
|
|
43
|
+
#
|
|
44
|
+
# @return [Array<Greeb::Entity>] found abbreviations.
|
|
45
|
+
#
|
|
46
|
+
def abbrevs(text)
|
|
47
|
+
scan(text, ABBREV, :abbrev)
|
|
48
|
+
end
|
|
49
|
+
|
|
37
50
|
private
|
|
38
51
|
# Implementation of regexp-based {Greeb::Entity} scanner.
|
|
39
52
|
#
|
data/lib/greeb/tokenizer.rb
CHANGED
|
@@ -57,6 +57,20 @@ module Greeb::Tokenizer
|
|
|
57
57
|
scanner.terminate
|
|
58
58
|
end
|
|
59
59
|
|
|
60
|
+
# Split one line into characters array, but also combine duplicated
|
|
61
|
+
# characters.
|
|
62
|
+
#
|
|
63
|
+
# For instance, `"a b\n\n\nc"` would be transformed into the following
|
|
64
|
+
# array: `["a", " ", "b", "\n\n\n", "c"]`.
|
|
65
|
+
#
|
|
66
|
+
# @param token [String] a token to be splitted.
|
|
67
|
+
#
|
|
68
|
+
# @return [Array<String>] splitted characters.
|
|
69
|
+
#
|
|
70
|
+
def split(token)
|
|
71
|
+
token.scan(/((.|\n)\2*)/).map(&:first)
|
|
72
|
+
end
|
|
73
|
+
|
|
60
74
|
protected
|
|
61
75
|
# One iteration of the tokenization process.
|
|
62
76
|
#
|
|
@@ -115,18 +129,4 @@ module Greeb::Tokenizer
|
|
|
115
129
|
before + s.length
|
|
116
130
|
end
|
|
117
131
|
end
|
|
118
|
-
|
|
119
|
-
# Split one line into characters array, but also combine line breaks
|
|
120
|
-
# into single elements.
|
|
121
|
-
#
|
|
122
|
-
# For instance, `"a b\n\n\nc"` would be transformed into the following
|
|
123
|
-
# array: `["a", " ", "b", "\n\n\n", "c"]`.
|
|
124
|
-
#
|
|
125
|
-
# @param token [String] a token to be splitted.
|
|
126
|
-
#
|
|
127
|
-
# @return [Array<String>] splitted characters.
|
|
128
|
-
#
|
|
129
|
-
def split(token)
|
|
130
|
-
token.scan(/((.|\n)\2*)/).map(&:first)
|
|
131
|
-
end
|
|
132
132
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/parser_spec.rb
CHANGED
|
@@ -5,9 +5,9 @@ require_relative 'spec_helper'
|
|
|
5
5
|
module Greeb
|
|
6
6
|
describe Parser do
|
|
7
7
|
let(:text) do
|
|
8
|
-
'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
|
|
8
|
+
'Hello there! My name is Vasya B. My website is: http://вася.рф/. ' \
|
|
9
9
|
'And my e-mail is example@example.com! Also it is available by ' \
|
|
10
|
-
'URL: http://vasya.ru.'
|
|
10
|
+
'URL: http://vasya.ru. Also, G.L.H.F. everyone!'
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
describe 'URL' do
|
|
@@ -15,8 +15,8 @@ module Greeb
|
|
|
15
15
|
|
|
16
16
|
it 'recognizes URLs' do
|
|
17
17
|
subject.must_equal(
|
|
18
|
-
[Entity.new(
|
|
19
|
-
Entity.new(
|
|
18
|
+
[Entity.new(48, 63, :url),
|
|
19
|
+
Entity.new(132, 147, :url)]
|
|
20
20
|
)
|
|
21
21
|
end
|
|
22
22
|
end
|
|
@@ -26,7 +26,18 @@ module Greeb
|
|
|
26
26
|
|
|
27
27
|
it 'recognizes e-mails' do
|
|
28
28
|
subject.must_equal(
|
|
29
|
-
[Entity.new(
|
|
29
|
+
[Entity.new(82, 101, :email)]
|
|
30
|
+
)
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
describe 'ABBREV' do
|
|
35
|
+
subject { Parser.abbrevs(text) }
|
|
36
|
+
|
|
37
|
+
it 'recognizes abbreviations' do
|
|
38
|
+
subject.must_equal(
|
|
39
|
+
[Entity.new(30, 32, :abbrev),
|
|
40
|
+
Entity.new(155, 163, :abbrev)]
|
|
30
41
|
)
|
|
31
42
|
end
|
|
32
43
|
end
|
data/spec/support/invoker.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
|
@@ -79,5 +79,20 @@ module Greeb
|
|
|
79
79
|
)
|
|
80
80
|
end
|
|
81
81
|
end
|
|
82
|
+
|
|
83
|
+
describe '.split' do
|
|
84
|
+
it 'should split characters' do
|
|
85
|
+
Tokenizer.split('loh').must_equal %w(l o h)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
it 'should combine duplicated characters' do
|
|
89
|
+
Tokenizer.split('foo').must_equal %w(f oo)
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it 'should also deal with line breaks' do
|
|
93
|
+
Tokenizer.split("bar\n\nbaz").must_equal(
|
|
94
|
+
[*%w(b a r), "\n\n", *%w(b a z)])
|
|
95
|
+
end
|
|
96
|
+
end
|
|
82
97
|
end
|
|
83
98
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: greeb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0.rc1
|
|
4
|
+
version: 0.2.0.rc2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dmitry Ustalov
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2013-05-
|
|
11
|
+
date: 2013-05-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|
|
@@ -28,16 +28,16 @@ dependencies:
|
|
|
28
28
|
name: minitest
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- -
|
|
31
|
+
- - ~>
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
33
|
+
version: '5.0'
|
|
34
34
|
type: :development
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- -
|
|
38
|
+
- - ~>
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
40
|
+
version: '5.0'
|
|
41
41
|
description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
|
|
42
42
|
written in Ruby.
|
|
43
43
|
email:
|