greeb 0.2.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +8 -8
- data/bin/greeb +34 -2
- data/lib/greeb.rb +0 -2
- data/lib/greeb/core.rb +2 -2
- data/lib/greeb/parser.rb +61 -0
- data/lib/greeb/segmentator.rb +5 -9
- data/lib/greeb/span.rb +10 -0
- data/lib/greeb/tokenizer.rb +2 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/bin_spec.rb +8 -0
- data/spec/parser_spec.rb +23 -1
- data/spec/span_spec.rb +10 -0
- data/spec/support/invoker.rb +5 -1
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07673b32254cd2b0ab0edf0664fa59e46231dbe3
|
4
|
+
data.tar.gz: f8eaac92c0fd4d7dda99c4441117b5e2b34c5caa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ebfda44f713c3dcda9df0439f073a1c075360c14cc41e99c400db8eec25f06033ba2d788b5c4b7b8715eeb5cc085e6c704f6e9bad08bfd6af7b4bc4d051a8c32
|
7
|
+
data.tar.gz: cb60f13ddad1e17a7cdbbd19add866677f92590cb7072dd5dc3cf70b80c93a31e0e857366908ff214bb0cc5b2c585415abd78338bd8479ea3150ebf58f5d2117
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Greeb
|
2
2
|
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
3
|
-
that is based on regular expressions. API documentation is available
|
4
|
-
<http://rubydoc.info/github/dmchk/greeb/master/frames>.
|
3
|
+
that is based on regular expressions. The API documentation is available
|
4
|
+
at <http://rubydoc.info/github/dmchk/greeb/master/frames>.
|
5
5
|
|
6
6
|
## Installation
|
7
7
|
Add this line to your application's Gemfile:
|
@@ -134,12 +134,12 @@ addresses. Greeb can help you in these strings retrieval.
|
|
134
134
|
```ruby
|
135
135
|
text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
136
136
|
|
137
|
-
pp Greeb::Parser.urls(text).map { |e| [e,
|
137
|
+
pp Greeb::Parser.urls(text).map { |e| [e, e.slice(text)] }
|
138
138
|
=begin
|
139
139
|
[[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
|
140
140
|
=end
|
141
141
|
|
142
|
-
pp Greeb::Parser.emails(text).map { |e| [e,
|
142
|
+
pp Greeb::Parser.emails(text).map { |e| [e, e.slice(text)] }
|
143
143
|
=begin
|
144
144
|
[[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
|
145
145
|
=end
|
@@ -151,7 +151,7 @@ Please don't use Greeb in spam lists development purposes.
|
|
151
151
|
```ruby
|
152
152
|
text = 'Hello, G.L.H.F. everyone!'
|
153
153
|
|
154
|
-
pp Greeb::Parser.abbrevs(text).map { |e| [e,
|
154
|
+
pp Greeb::Parser.abbrevs(text).map { |e| [e, e.slice(text)] }
|
155
155
|
=begin
|
156
156
|
[[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
|
157
157
|
=end
|
@@ -164,7 +164,7 @@ situations.
|
|
164
164
|
```ruby
|
165
165
|
text = 'Our time is running out: 13:37 or 14:89.'
|
166
166
|
|
167
|
-
pp Greeb::Parser.time(text).map { |e| [e,
|
167
|
+
pp Greeb::Parser.time(text).map { |e| [e, e.slice(text)] }
|
168
168
|
=begin
|
169
169
|
[[#<struct Greeb::Span from=25, to=30, type=:time>, "13:37"]]
|
170
170
|
=end
|
@@ -194,6 +194,6 @@ There are several span types at the tokenization stage: `:letter`,
|
|
194
194
|
|
195
195
|
## Copyright
|
196
196
|
|
197
|
-
Copyright (c) 2010-
|
197
|
+
Copyright (c) 2010-2014 [Dmitry Ustalov]. See LICENSE for details.
|
198
198
|
|
199
|
-
[Dmitry Ustalov]: http://
|
199
|
+
[Dmitry Ustalov]: http://ustalov.name/
|
data/bin/greeb
CHANGED
@@ -1,13 +1,45 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'ostruct'
|
4
|
+
require 'optparse'
|
5
|
+
|
3
6
|
if File.exists? File.expand_path('../../.git', __FILE__)
|
4
7
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
5
8
|
end
|
6
9
|
|
7
10
|
require 'greeb'
|
8
11
|
|
9
|
-
|
12
|
+
options = OpenStruct.new(input: STDIN, output: STDOUT)
|
13
|
+
|
14
|
+
optparse = OptionParser.new do |opts|
|
15
|
+
opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME
|
16
|
+
|
17
|
+
opts.on '-i', '--input [file]', 'Input file' do |input|
|
18
|
+
options.input = File.open(input)
|
19
|
+
at_exit { options.input.close }
|
20
|
+
end
|
21
|
+
|
22
|
+
opts.on '-o', '--output [file]', 'Output file' do |output|
|
23
|
+
options.output = File.open(output)
|
24
|
+
at_exit { options.output.close }
|
25
|
+
end
|
26
|
+
|
27
|
+
opts.on_tail '-h', '--help', 'Just display this help' do
|
28
|
+
puts opts
|
29
|
+
exit
|
30
|
+
end
|
31
|
+
|
32
|
+
opts.on_tail '-v', '--version', 'Just print the version infomation' do
|
33
|
+
puts 'Greeb %s' % Greeb::VERSION
|
34
|
+
exit
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
optparse.parse!
|
39
|
+
|
40
|
+
text = options.input.read.tap(&:chomp!)
|
10
41
|
|
11
42
|
Greeb[text].each do |span|
|
12
|
-
|
43
|
+
next if [:space, :break].include? span.type
|
44
|
+
options.output.puts text[span.from...span.to]
|
13
45
|
end
|
data/lib/greeb.rb
CHANGED
data/lib/greeb/core.rb
CHANGED
@@ -11,7 +11,8 @@ module Greeb::Core
|
|
11
11
|
|
12
12
|
# Recognize e-mail addresses in the input text.
|
13
13
|
#
|
14
|
-
# @param text [String] input text.
|
14
|
+
# @param text [String] an input text.
|
15
|
+
# @param helpers [Array<Symbol>] a set of helper identifiers.
|
15
16
|
#
|
16
17
|
# @return [Array<Greeb::Span>] a set of tokens.
|
17
18
|
#
|
@@ -27,7 +28,6 @@ module Greeb::Core
|
|
27
28
|
|
28
29
|
alias_method :'[]', :analyze
|
29
30
|
|
30
|
-
protected
|
31
31
|
# Extract spans of the specified type from the input spans set.
|
32
32
|
#
|
33
33
|
# @param spans [Array<Greeb::Span>] input spans set.
|
data/lib/greeb/parser.rb
CHANGED
@@ -8,20 +8,33 @@ module Greeb::Parser
|
|
8
8
|
extend self
|
9
9
|
|
10
10
|
# A URL pattern. Not so precise, but IDN-compatible.
|
11
|
+
#
|
11
12
|
URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
|
12
13
|
|
13
14
|
# A horrible e-mail pattern.
|
15
|
+
#
|
14
16
|
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
|
15
17
|
|
16
18
|
# Another horrible pattern. Now for abbreviations.
|
19
|
+
#
|
17
20
|
ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
|
18
21
|
|
19
22
|
# This pattern matches anything that looks like HTML. Or not.
|
23
|
+
#
|
20
24
|
HTML = /<(.*?)>/i
|
21
25
|
|
22
26
|
# Time pattern.
|
27
|
+
#
|
23
28
|
TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i
|
24
29
|
|
30
|
+
# Apostrophe pattern.
|
31
|
+
#
|
32
|
+
APOSTROPHE = /['’]/i
|
33
|
+
|
34
|
+
# Together pattern.
|
35
|
+
#
|
36
|
+
TOGETHER = [:letter, :integer, :apostrophe, :together]
|
37
|
+
|
25
38
|
# Recognize URLs in the input text. Actually, URL is obsolete standard
|
26
39
|
# and this code should be rewritten to use the URI concept.
|
27
40
|
#
|
@@ -73,6 +86,54 @@ module Greeb::Parser
|
|
73
86
|
scan(text, TIME, :time)
|
74
87
|
end
|
75
88
|
|
89
|
+
# Retrieve apostrophes from the tokenized text. The algorithm may be
|
90
|
+
# more optimal.
|
91
|
+
#
|
92
|
+
# @param text [String] input text.
|
93
|
+
# @param spans [Array<Greeb::Span>] already tokenized text.
|
94
|
+
#
|
95
|
+
# @return [Array<Greeb::Span>] retrieved apostrophes.
|
96
|
+
#
|
97
|
+
def apostrophes(text, spans)
|
98
|
+
apostrophes = scan(text, APOSTROPHE, :apostrophe)
|
99
|
+
return [] if apostrophes.empty?
|
100
|
+
|
101
|
+
apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear
|
102
|
+
|
103
|
+
spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
|
104
|
+
next unless s1 && s1.type == :letter
|
105
|
+
next unless s2 && s2.type == :apostrophe
|
106
|
+
next unless !s3 || s3 && s3.type == :letter
|
107
|
+
s3, k = s2, j unless s3
|
108
|
+
apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
|
109
|
+
spans[i..k] = apostrophes.last
|
110
|
+
end
|
111
|
+
|
112
|
+
apostrophes
|
113
|
+
end
|
114
|
+
|
115
|
+
# Merge some spans that are together.
|
116
|
+
#
|
117
|
+
# @param spans [Array<Greeb::Span>] already tokenized text.
|
118
|
+
#
|
119
|
+
# @return [Array<Greeb::Span>] merged spans.
|
120
|
+
#
|
121
|
+
def together(spans)
|
122
|
+
loop do
|
123
|
+
converged = true
|
124
|
+
|
125
|
+
spans.each_with_index.each_cons(2).reverse_each do |(s1, i), (s2, j)|
|
126
|
+
next unless TOGETHER.include?(s1.type) && TOGETHER.include?(s2.type)
|
127
|
+
spans[i..j] = Greeb::Span.new(s1.from, s2.to, :together)
|
128
|
+
converged = false
|
129
|
+
end
|
130
|
+
|
131
|
+
break if converged
|
132
|
+
end
|
133
|
+
|
134
|
+
spans
|
135
|
+
end
|
136
|
+
|
76
137
|
private
|
77
138
|
# Implementation of regexp-based {Greeb::Span} scanner.
|
78
139
|
#
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -55,12 +55,12 @@ class Greeb::Segmentator
|
|
55
55
|
# process.
|
56
56
|
# @param stop_marks [Array<Symbol>] an array that stores the
|
57
57
|
# correspondent stop marks of the necessary spans.
|
58
|
+
# @param collection [Array<Greeb::Span>] an initial set of spans
|
59
|
+
# to be populated.
|
58
60
|
#
|
59
|
-
# @return [Array<Greeb::Span>] a
|
61
|
+
# @return [Array<Greeb::Span>] a modified collection.
|
60
62
|
#
|
61
|
-
def detect_spans(sample, stop_marks)
|
62
|
-
collection = []
|
63
|
-
|
63
|
+
def detect_spans(sample, stop_marks, collection = [])
|
64
64
|
rest = tokens.inject(sample.dup) do |span, token|
|
65
65
|
next span if sentence_aint_start? span, token
|
66
66
|
span.from = token.from unless span.from
|
@@ -77,11 +77,7 @@ class Greeb::Segmentator
|
|
77
77
|
span
|
78
78
|
end
|
79
79
|
|
80
|
-
|
81
|
-
collection << rest
|
82
|
-
else
|
83
|
-
collection
|
84
|
-
end
|
80
|
+
rest.from && rest.to ? collection << rest : collection
|
85
81
|
end
|
86
82
|
|
87
83
|
private
|
data/lib/greeb/span.rb
CHANGED
@@ -19,6 +19,16 @@ class Greeb::Span < Struct.new(:from, :to, :type)
|
|
19
19
|
Struct.new(*self.members, *members)
|
20
20
|
end
|
21
21
|
|
22
|
+
# Select the slice of the given text using coordinates of this span.
|
23
|
+
#
|
24
|
+
# @param text [String] a text to be extracted.
|
25
|
+
#
|
26
|
+
# @return [String] the retrieved substring.
|
27
|
+
#
|
28
|
+
def slice(text)
|
29
|
+
text[from...to]
|
30
|
+
end
|
31
|
+
|
22
32
|
# @private
|
23
33
|
def <=> other
|
24
34
|
if (comparison = self.from <=> other.from) == 0
|
data/lib/greeb/tokenizer.rb
CHANGED
data/lib/greeb/version.rb
CHANGED
data/spec/bin_spec.rb
CHANGED
@@ -21,4 +21,12 @@ describe 'CLI' do
|
|
21
21
|
invoke(stdin: 'Hello example@example.com guys!').must_equal(
|
22
22
|
%w(Hello example@example.com guys !))
|
23
23
|
end
|
24
|
+
|
25
|
+
it 'should print version' do
|
26
|
+
invoke('-v').join.must_match(/\AGreeb (\d\.)+\d\z/)
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should print help' do
|
30
|
+
invoke('-h').join.must_match(/Usage/)
|
31
|
+
end
|
24
32
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -8,9 +8,11 @@ describe Parser do
|
|
8
8
|
'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
|
9
9
|
'example@example.com! It is available by URL: http://vasya.ru. ' \
|
10
10
|
'Also, <b>G.L.H.F.</b> everyone! It\'s 13:37 or 00:02:28 right ' \
|
11
|
-
'now, not 14:89.').freeze
|
11
|
+
'now, not 14:89. What about some Nagibator228?').freeze
|
12
12
|
end
|
13
13
|
|
14
|
+
let(:spans) { Tokenizer.tokenize(text) }
|
15
|
+
|
14
16
|
describe 'URL' do
|
15
17
|
subject { Parser.urls(text) }
|
16
18
|
|
@@ -67,4 +69,24 @@ describe Parser do
|
|
67
69
|
)
|
68
70
|
end
|
69
71
|
end
|
72
|
+
|
73
|
+
describe 'APOSTROPHE' do
|
74
|
+
subject { Parser.apostrophes(text, spans.dup) }
|
75
|
+
|
76
|
+
it 'recognizes apostrophes' do
|
77
|
+
subject.must_equal(
|
78
|
+
[Span.new(220, 224, :letter)]
|
79
|
+
)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
describe 'TOGETHER' do
|
84
|
+
subject { Parser.together(spans.dup) }
|
85
|
+
|
86
|
+
it 'merges connected spans' do
|
87
|
+
subject.select { |s| s.type == :together }.must_equal(
|
88
|
+
[Span.new(281, 293, :together)]
|
89
|
+
)
|
90
|
+
end
|
91
|
+
end
|
70
92
|
end
|
data/spec/span_spec.rb
CHANGED
@@ -60,4 +60,14 @@ describe Span do
|
|
60
60
|
Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
|
61
61
|
end
|
62
62
|
end
|
63
|
+
|
64
|
+
describe 'slicing' do
|
65
|
+
let(:text) { 'test228' }
|
66
|
+
|
67
|
+
subject { Span.new(4, 7) }
|
68
|
+
|
69
|
+
it 'should extract slices using #slice' do
|
70
|
+
subject.slice(text).must_equal '228'
|
71
|
+
end
|
72
|
+
end
|
63
73
|
end
|
data/spec/support/invoker.rb
CHANGED
@@ -19,11 +19,15 @@ class MiniTest::Test
|
|
19
19
|
arguments = argv.dup
|
20
20
|
options = (arguments.last.is_a? Hash) ? arguments.pop : {}
|
21
21
|
executable = File.expand_path('../../../bin/greeb', __FILE__)
|
22
|
+
status = nil
|
22
23
|
|
23
|
-
Open3.popen3(executable, *arguments) do |i, o,
|
24
|
+
Open3.popen3(executable, *arguments) do |i, o, _, t|
|
24
25
|
i.puts options[:stdin] if options[:stdin]
|
25
26
|
i.close
|
26
27
|
invoke_cache[argv] = o.readlines.map(&:chomp!)
|
28
|
+
status = t.value
|
27
29
|
end
|
30
|
+
|
31
|
+
invoke_cache[argv] if status.success?
|
28
32
|
end
|
29
33
|
end
|
metadata
CHANGED
@@ -1,27 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-05-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '5.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '5.0'
|
27
27
|
description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
|
@@ -33,8 +33,8 @@ executables:
|
|
33
33
|
extensions: []
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
|
-
- .gitignore
|
37
|
-
- .travis.yml
|
36
|
+
- ".gitignore"
|
37
|
+
- ".travis.yml"
|
38
38
|
- Gemfile
|
39
39
|
- LICENSE
|
40
40
|
- README.md
|
@@ -68,17 +68,17 @@ require_paths:
|
|
68
68
|
- lib
|
69
69
|
required_ruby_version: !ruby/object:Gem::Requirement
|
70
70
|
requirements:
|
71
|
-
- -
|
71
|
+
- - ">="
|
72
72
|
- !ruby/object:Gem::Version
|
73
73
|
version: '0'
|
74
74
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
75
75
|
requirements:
|
76
|
-
- -
|
76
|
+
- - ">="
|
77
77
|
- !ruby/object:Gem::Version
|
78
78
|
version: '0'
|
79
79
|
requirements: []
|
80
80
|
rubyforge_project: greeb
|
81
|
-
rubygems_version: 2.
|
81
|
+
rubygems_version: 2.2.2
|
82
82
|
signing_key:
|
83
83
|
specification_version: 4
|
84
84
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|