greeb 0.2.2.1 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +8 -8
- data/bin/greeb +34 -2
- data/lib/greeb.rb +0 -2
- data/lib/greeb/core.rb +2 -2
- data/lib/greeb/parser.rb +61 -0
- data/lib/greeb/segmentator.rb +5 -9
- data/lib/greeb/span.rb +10 -0
- data/lib/greeb/tokenizer.rb +2 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/bin_spec.rb +8 -0
- data/spec/parser_spec.rb +23 -1
- data/spec/span_spec.rb +10 -0
- data/spec/support/invoker.rb +5 -1
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07673b32254cd2b0ab0edf0664fa59e46231dbe3
|
4
|
+
data.tar.gz: f8eaac92c0fd4d7dda99c4441117b5e2b34c5caa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ebfda44f713c3dcda9df0439f073a1c075360c14cc41e99c400db8eec25f06033ba2d788b5c4b7b8715eeb5cc085e6c704f6e9bad08bfd6af7b4bc4d051a8c32
|
7
|
+
data.tar.gz: cb60f13ddad1e17a7cdbbd19add866677f92590cb7072dd5dc3cf70b80c93a31e0e857366908ff214bb0cc5b2c585415abd78338bd8479ea3150ebf58f5d2117
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Greeb
|
2
2
|
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
3
|
-
that is based on regular expressions. API documentation is available
|
4
|
-
<http://rubydoc.info/github/dmchk/greeb/master/frames>.
|
3
|
+
that is based on regular expressions. The API documentation is available
|
4
|
+
at <http://rubydoc.info/github/dmchk/greeb/master/frames>.
|
5
5
|
|
6
6
|
## Installation
|
7
7
|
Add this line to your application's Gemfile:
|
@@ -134,12 +134,12 @@ addresses. Greeb can help you in these strings retrieval.
|
|
134
134
|
```ruby
|
135
135
|
text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
136
136
|
|
137
|
-
pp Greeb::Parser.urls(text).map { |e| [e,
|
137
|
+
pp Greeb::Parser.urls(text).map { |e| [e, e.slice(text)] }
|
138
138
|
=begin
|
139
139
|
[[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
|
140
140
|
=end
|
141
141
|
|
142
|
-
pp Greeb::Parser.emails(text).map { |e| [e,
|
142
|
+
pp Greeb::Parser.emails(text).map { |e| [e, e.slice(text)] }
|
143
143
|
=begin
|
144
144
|
[[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
|
145
145
|
=end
|
@@ -151,7 +151,7 @@ Please don't use Greeb in spam lists development purposes.
|
|
151
151
|
```ruby
|
152
152
|
text = 'Hello, G.L.H.F. everyone!'
|
153
153
|
|
154
|
-
pp Greeb::Parser.abbrevs(text).map { |e| [e,
|
154
|
+
pp Greeb::Parser.abbrevs(text).map { |e| [e, e.slice(text)] }
|
155
155
|
=begin
|
156
156
|
[[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
|
157
157
|
=end
|
@@ -164,7 +164,7 @@ situations.
|
|
164
164
|
```ruby
|
165
165
|
text = 'Our time is running out: 13:37 or 14:89.'
|
166
166
|
|
167
|
-
pp Greeb::Parser.time(text).map { |e| [e,
|
167
|
+
pp Greeb::Parser.time(text).map { |e| [e, e.slice(text)] }
|
168
168
|
=begin
|
169
169
|
[[#<struct Greeb::Span from=25, to=30, type=:time>, "13:37"]]
|
170
170
|
=end
|
@@ -194,6 +194,6 @@ There are several span types at the tokenization stage: `:letter`,
|
|
194
194
|
|
195
195
|
## Copyright
|
196
196
|
|
197
|
-
Copyright (c) 2010-
|
197
|
+
Copyright (c) 2010-2014 [Dmitry Ustalov]. See LICENSE for details.
|
198
198
|
|
199
|
-
[Dmitry Ustalov]: http://
|
199
|
+
[Dmitry Ustalov]: http://ustalov.name/
|
data/bin/greeb
CHANGED
@@ -1,13 +1,45 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'ostruct'
|
4
|
+
require 'optparse'
|
5
|
+
|
3
6
|
if File.exists? File.expand_path('../../.git', __FILE__)
|
4
7
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
5
8
|
end
|
6
9
|
|
7
10
|
require 'greeb'
|
8
11
|
|
9
|
-
|
12
|
+
options = OpenStruct.new(input: STDIN, output: STDOUT)
|
13
|
+
|
14
|
+
optparse = OptionParser.new do |opts|
|
15
|
+
opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME
|
16
|
+
|
17
|
+
opts.on '-i', '--input [file]', 'Input file' do |input|
|
18
|
+
options.input = File.open(input)
|
19
|
+
at_exit { options.input.close }
|
20
|
+
end
|
21
|
+
|
22
|
+
opts.on '-o', '--output [file]', 'Output file' do |output|
|
23
|
+
options.output = File.open(output)
|
24
|
+
at_exit { options.output.close }
|
25
|
+
end
|
26
|
+
|
27
|
+
opts.on_tail '-h', '--help', 'Just display this help' do
|
28
|
+
puts opts
|
29
|
+
exit
|
30
|
+
end
|
31
|
+
|
32
|
+
opts.on_tail '-v', '--version', 'Just print the version infomation' do
|
33
|
+
puts 'Greeb %s' % Greeb::VERSION
|
34
|
+
exit
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
optparse.parse!
|
39
|
+
|
40
|
+
text = options.input.read.tap(&:chomp!)
|
10
41
|
|
11
42
|
Greeb[text].each do |span|
|
12
|
-
|
43
|
+
next if [:space, :break].include? span.type
|
44
|
+
options.output.puts text[span.from...span.to]
|
13
45
|
end
|
data/lib/greeb.rb
CHANGED
data/lib/greeb/core.rb
CHANGED
@@ -11,7 +11,8 @@ module Greeb::Core
|
|
11
11
|
|
12
12
|
# Recognize e-mail addresses in the input text.
|
13
13
|
#
|
14
|
-
# @param text [String] input text.
|
14
|
+
# @param text [String] an input text.
|
15
|
+
# @param helpers [Array<Symbol>] a set of helper identifiers.
|
15
16
|
#
|
16
17
|
# @return [Array<Greeb::Span>] a set of tokens.
|
17
18
|
#
|
@@ -27,7 +28,6 @@ module Greeb::Core
|
|
27
28
|
|
28
29
|
alias_method :'[]', :analyze
|
29
30
|
|
30
|
-
protected
|
31
31
|
# Extract spans of the specified type from the input spans set.
|
32
32
|
#
|
33
33
|
# @param spans [Array<Greeb::Span>] input spans set.
|
data/lib/greeb/parser.rb
CHANGED
@@ -8,20 +8,33 @@ module Greeb::Parser
|
|
8
8
|
extend self
|
9
9
|
|
10
10
|
# An URL pattern. Not so precise, but IDN-compatible.
|
11
|
+
#
|
11
12
|
URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
|
12
13
|
|
13
14
|
# A horrible e-mail pattern.
|
15
|
+
#
|
14
16
|
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
|
15
17
|
|
16
18
|
# Another horrible pattern. Now for abbreviations.
|
19
|
+
#
|
17
20
|
ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
|
18
21
|
|
19
22
|
# This pattern matches anything that looks like HTML. Or not.
|
23
|
+
#
|
20
24
|
HTML = /<(.*?)>/i
|
21
25
|
|
22
26
|
# Time pattern.
|
27
|
+
#
|
23
28
|
TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i
|
24
29
|
|
30
|
+
# Apostrophe pattern.
|
31
|
+
#
|
32
|
+
APOSTROPHE = /['’]/i
|
33
|
+
|
34
|
+
# Together pattern.
|
35
|
+
#
|
36
|
+
TOGETHER = [:letter, :integer, :apostrophe, :together]
|
37
|
+
|
25
38
|
# Recognize URLs in the input text. Actually, URL is obsolete standard
|
26
39
|
# and this code should be rewritten to use the URI concept.
|
27
40
|
#
|
@@ -73,6 +86,54 @@ module Greeb::Parser
|
|
73
86
|
scan(text, TIME, :time)
|
74
87
|
end
|
75
88
|
|
89
|
+
# Retrieve apostrophes from the tokenized text. The algorithm may be
|
90
|
+
# more optimal.
|
91
|
+
#
|
92
|
+
# @param text [String] input text.
|
93
|
+
# @param spans [Array<Greeb::Span>] already tokenized text.
|
94
|
+
#
|
95
|
+
# @return [Array<Greeb::Span>] retrieved apostrophes.
|
96
|
+
#
|
97
|
+
def apostrophes(text, spans)
|
98
|
+
apostrophes = scan(text, APOSTROPHE, :apostrophe)
|
99
|
+
return [] if apostrophes.empty?
|
100
|
+
|
101
|
+
apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear
|
102
|
+
|
103
|
+
spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
|
104
|
+
next unless s1 && s1.type == :letter
|
105
|
+
next unless s2 && s2.type == :apostrophe
|
106
|
+
next unless !s3 || s3 && s3.type == :letter
|
107
|
+
s3, k = s2, j unless s3
|
108
|
+
apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
|
109
|
+
spans[i..k] = apostrophes.last
|
110
|
+
end
|
111
|
+
|
112
|
+
apostrophes
|
113
|
+
end
|
114
|
+
|
115
|
+
# Merge some spans that are together.
|
116
|
+
#
|
117
|
+
# @param spans [Array<Greeb::Span>] already tokenized text.
|
118
|
+
#
|
119
|
+
# @return [Array<Greeb::Span>] merged spans.
|
120
|
+
#
|
121
|
+
def together(spans)
|
122
|
+
loop do
|
123
|
+
converged = true
|
124
|
+
|
125
|
+
spans.each_with_index.each_cons(2).reverse_each do |(s1, i), (s2, j)|
|
126
|
+
next unless TOGETHER.include?(s1.type) && TOGETHER.include?(s2.type)
|
127
|
+
spans[i..j] = Greeb::Span.new(s1.from, s2.to, :together)
|
128
|
+
converged = false
|
129
|
+
end
|
130
|
+
|
131
|
+
break if converged
|
132
|
+
end
|
133
|
+
|
134
|
+
spans
|
135
|
+
end
|
136
|
+
|
76
137
|
private
|
77
138
|
# Implementation of regexp-based {Greeb::Span} scanner.
|
78
139
|
#
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -55,12 +55,12 @@ class Greeb::Segmentator
|
|
55
55
|
# process.
|
56
56
|
# @param stop_marks [Array<Symbol>] an array that stores the
|
57
57
|
# correspondent stop marks of the necessary spans.
|
58
|
+
# @param collection [Array<Greeb::Span>] an initial set of spans
|
59
|
+
# to be populated.
|
58
60
|
#
|
59
|
-
# @return [Array<Greeb::Span>] a
|
61
|
+
# @return [Array<Greeb::Span>] a modified collection.
|
60
62
|
#
|
61
|
-
def detect_spans(sample, stop_marks)
|
62
|
-
collection = []
|
63
|
-
|
63
|
+
def detect_spans(sample, stop_marks, collection = [])
|
64
64
|
rest = tokens.inject(sample.dup) do |span, token|
|
65
65
|
next span if sentence_aint_start? span, token
|
66
66
|
span.from = token.from unless span.from
|
@@ -77,11 +77,7 @@ class Greeb::Segmentator
|
|
77
77
|
span
|
78
78
|
end
|
79
79
|
|
80
|
-
|
81
|
-
collection << rest
|
82
|
-
else
|
83
|
-
collection
|
84
|
-
end
|
80
|
+
rest.from && rest.to ? collection << rest : collection
|
85
81
|
end
|
86
82
|
|
87
83
|
private
|
data/lib/greeb/span.rb
CHANGED
@@ -19,6 +19,16 @@ class Greeb::Span < Struct.new(:from, :to, :type)
|
|
19
19
|
Struct.new(*self.members, *members)
|
20
20
|
end
|
21
21
|
|
22
|
+
# Select the slice of the given text using coordinates of this span.
|
23
|
+
#
|
24
|
+
# @param text [String] a text to be extracted.
|
25
|
+
#
|
26
|
+
# @return [String] the retrieved substring.
|
27
|
+
#
|
28
|
+
def slice(text)
|
29
|
+
text[from...to]
|
30
|
+
end
|
31
|
+
|
22
32
|
# @private
|
23
33
|
def <=> other
|
24
34
|
if (comparison = self.from <=> other.from) == 0
|
data/lib/greeb/tokenizer.rb
CHANGED
data/lib/greeb/version.rb
CHANGED
data/spec/bin_spec.rb
CHANGED
@@ -21,4 +21,12 @@ describe 'CLI' do
|
|
21
21
|
invoke(stdin: 'Hello example@example.com guys!').must_equal(
|
22
22
|
%w(Hello example@example.com guys !))
|
23
23
|
end
|
24
|
+
|
25
|
+
it 'should print version' do
|
26
|
+
invoke('-v').join.must_match(/\AGreeb (\d\.)+\d\z/)
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should print help' do
|
30
|
+
invoke('-h').join.must_match(/Usage/)
|
31
|
+
end
|
24
32
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -8,9 +8,11 @@ describe Parser do
|
|
8
8
|
'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
|
9
9
|
'example@example.com! It is available by URL: http://vasya.ru. ' \
|
10
10
|
'Also, <b>G.L.H.F.</b> everyone! It\'s 13:37 or 00:02:28 right ' \
|
11
|
-
'now, not 14:89.').freeze
|
11
|
+
'now, not 14:89. What about some Nagibator228?').freeze
|
12
12
|
end
|
13
13
|
|
14
|
+
let(:spans) { Tokenizer.tokenize(text) }
|
15
|
+
|
14
16
|
describe 'URL' do
|
15
17
|
subject { Parser.urls(text) }
|
16
18
|
|
@@ -67,4 +69,24 @@ describe Parser do
|
|
67
69
|
)
|
68
70
|
end
|
69
71
|
end
|
72
|
+
|
73
|
+
describe 'APOSTROPHE' do
|
74
|
+
subject { Parser.apostrophes(text, spans.dup) }
|
75
|
+
|
76
|
+
it 'recognizes apostrophes' do
|
77
|
+
subject.must_equal(
|
78
|
+
[Span.new(220, 224, :letter)]
|
79
|
+
)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
describe 'TOGETHER' do
|
84
|
+
subject { Parser.together(spans.dup) }
|
85
|
+
|
86
|
+
it 'merges connected spans' do
|
87
|
+
subject.select { |s| s.type == :together }.must_equal(
|
88
|
+
[Span.new(281, 293, :together)]
|
89
|
+
)
|
90
|
+
end
|
91
|
+
end
|
70
92
|
end
|
data/spec/span_spec.rb
CHANGED
@@ -60,4 +60,14 @@ describe Span do
|
|
60
60
|
Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
|
61
61
|
end
|
62
62
|
end
|
63
|
+
|
64
|
+
describe 'slicing' do
|
65
|
+
let(:text) { 'test228' }
|
66
|
+
|
67
|
+
subject { Span.new(4, 7) }
|
68
|
+
|
69
|
+
it 'should extract slices using #slice' do
|
70
|
+
subject.slice(text).must_equal '228'
|
71
|
+
end
|
72
|
+
end
|
63
73
|
end
|
data/spec/support/invoker.rb
CHANGED
@@ -19,11 +19,15 @@ class MiniTest::Test
|
|
19
19
|
arguments = argv.dup
|
20
20
|
options = (arguments.last.is_a? Hash) ? arguments.pop : {}
|
21
21
|
executable = File.expand_path('../../../bin/greeb', __FILE__)
|
22
|
+
status = nil
|
22
23
|
|
23
|
-
Open3.popen3(executable, *arguments) do |i, o,
|
24
|
+
Open3.popen3(executable, *arguments) do |i, o, _, t|
|
24
25
|
i.puts options[:stdin] if options[:stdin]
|
25
26
|
i.close
|
26
27
|
invoke_cache[argv] = o.readlines.map(&:chomp!)
|
28
|
+
status = t.value
|
27
29
|
end
|
30
|
+
|
31
|
+
invoke_cache[argv] if status.success?
|
28
32
|
end
|
29
33
|
end
|
metadata
CHANGED
@@ -1,27 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-05-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '5.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '5.0'
|
27
27
|
description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
|
@@ -33,8 +33,8 @@ executables:
|
|
33
33
|
extensions: []
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
|
-
- .gitignore
|
37
|
-
- .travis.yml
|
36
|
+
- ".gitignore"
|
37
|
+
- ".travis.yml"
|
38
38
|
- Gemfile
|
39
39
|
- LICENSE
|
40
40
|
- README.md
|
@@ -68,17 +68,17 @@ require_paths:
|
|
68
68
|
- lib
|
69
69
|
required_ruby_version: !ruby/object:Gem::Requirement
|
70
70
|
requirements:
|
71
|
-
- -
|
71
|
+
- - ">="
|
72
72
|
- !ruby/object:Gem::Version
|
73
73
|
version: '0'
|
74
74
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
75
75
|
requirements:
|
76
|
-
- -
|
76
|
+
- - ">="
|
77
77
|
- !ruby/object:Gem::Version
|
78
78
|
version: '0'
|
79
79
|
requirements: []
|
80
80
|
rubyforge_project: greeb
|
81
|
-
rubygems_version: 2.
|
81
|
+
rubygems_version: 2.2.2
|
82
82
|
signing_key:
|
83
83
|
specification_version: 4
|
84
84
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|