greeb 0.2.0.pre2 → 0.2.0.pre3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/.travis.yml +0 -1
- data/Gemfile +5 -1
- data/README.md +50 -14
- data/bin/greeb +4 -2
- data/greeb.gemspec +0 -2
- data/lib/greeb/parser.rb +4 -4
- data/lib/greeb/segmentator.rb +9 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/bin_spec.rb +24 -0
- data/spec/segmentator_spec.rb +1 -1
- data/spec/spec_helper.rb +5 -6
- data/spec/support/invoker.rb +29 -0
- data/spec/tokenizer_spec.rb +1 -1
- metadata +8 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 618591e00b61f1df11f98bdd045bd650d34ba863
|
4
|
+
data.tar.gz: 88d1b8448e98c18e6d9759e4d992d2fbea7c1d63
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e8113e47988e80aabfc07314268a5f8220cce88edbf06bd69b35602623c0a310c3c460e300143943596decae621ee69b4909371b9f43a7d9225bceb336bf21f6
|
7
|
+
data.tar.gz: 7ebe3c3e0a603bf1fc0072376c3b2b544b43ae38e31e8bc5ff9e34fcaf362b8c474ba565db67363c09f10a2f4960fdb0bf7a165ee6c0b90d657b3914231cc07a
|
data/.rubocop.yml
ADDED
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
Greeb
|
2
|
-
|
3
|
-
|
4
|
-
Greeb is a simple yet awesome and Unicode-aware text segmentator
|
1
|
+
# Greeb
|
2
|
+
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
5
3
|
that is based on regular expressions.
|
6
4
|
|
7
5
|
## Installation
|
8
|
-
|
9
6
|
Add this line to your application's Gemfile:
|
10
7
|
|
11
8
|
```ruby
|
@@ -21,8 +18,26 @@ Or install it yourself as:
|
|
21
18
|
$ gem install greeb
|
22
19
|
|
23
20
|
## Usage
|
21
|
+
Greeb can help you solve simple text processing problems such as
|
22
|
+
tokenization and segmentation.
|
23
|
+
|
24
|
+
It is available as a command line application that reads the input
|
25
|
+
text from STDIN and prints one token per line into STDOUT.
|
26
|
+
|
27
|
+
```
|
28
|
+
% echo 'Hello http://nlpub.ru guys, how are you?' | greeb
|
29
|
+
Hello
|
30
|
+
http://nlpub.ru
|
31
|
+
guys
|
32
|
+
,
|
33
|
+
how
|
34
|
+
are
|
35
|
+
you
|
36
|
+
?
|
37
|
+
```
|
24
38
|
|
25
|
-
|
39
|
+
### Tokenization API
|
40
|
+
Greeb has a very convenient API that makes you happy.
|
26
41
|
|
27
42
|
```ruby
|
28
43
|
pp Greeb::Tokenizer.tokenize('Hello!')
|
@@ -32,7 +47,7 @@ pp Greeb::Tokenizer.tokenize('Hello!')
|
|
32
47
|
=end
|
33
48
|
```
|
34
49
|
|
35
|
-
It should be noted that it is possible to process much complex texts
|
50
|
+
It should be noted that it is possible to process much more complex texts.
|
36
51
|
|
37
52
|
```ruby
|
38
53
|
text =<<-EOF
|
@@ -74,8 +89,9 @@ pp Greeb::Tokenizer.tokenize(text)
|
|
74
89
|
=end
|
75
90
|
```
|
76
91
|
|
92
|
+
### Segmentation API
|
77
93
|
Also it can be used to solve the text segmentation problems
|
78
|
-
such as sentence detection tasks
|
94
|
+
such as sentence detection tasks.
|
79
95
|
|
80
96
|
```ruby
|
81
97
|
text = 'Hello! How are you?'
|
@@ -88,7 +104,7 @@ pp Greeb::Segmentator.new(tokens).sentences
|
|
88
104
|
```
|
89
105
|
|
90
106
|
It is possible to extract tokens that were processed by the text
|
91
|
-
segmentator
|
107
|
+
segmentator.
|
92
108
|
|
93
109
|
```ruby
|
94
110
|
text = 'Hello! How are you?'
|
@@ -109,18 +125,36 @@ pp segmentator.extract(segmentator.sentences)
|
|
109
125
|
=end
|
110
126
|
```
|
111
127
|
|
112
|
-
|
128
|
+
### Parsing API
|
129
|
+
Texts often include some special entities such as URLs and e-mail
|
130
|
+
addresses. Greeb can help you retrieve these strings.
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
134
|
+
|
135
|
+
pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
|
136
|
+
=begin
|
137
|
+
[[#<struct Greeb::Entity from=14, to=29, type=:url>, "http://nlpub.ru"]]
|
138
|
+
=end
|
139
|
+
|
140
|
+
pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
|
141
|
+
=begin
|
142
|
+
[[#<struct Greeb::Entity from=44, to=63, type=:email>, "example@example.com"]]
|
143
|
+
=end
|
144
|
+
```
|
145
|
+
|
146
|
+
Please don't use Greeb for spam list development purposes.
|
113
147
|
|
148
|
+
## Tokens
|
114
149
|
Greeb operates with entities, tuples of *(from, to, kind)*, where
|
115
150
|
*from* is a beginning of the entity, *to* is an ending of the entity,
|
116
151
|
and *kind* is a type of the entity.
|
117
152
|
|
118
|
-
There are several entity types: `:letter`,
|
119
|
-
`:separ`, `:punct` (for punctuation), `:spunct`
|
120
|
-
punctuation), and `:break`.
|
153
|
+
There are several entity types at the tokenization stage: `:letter`,
|
154
|
+
`:float`, `:integer`, `:separ`, `:punct` (for punctuation), `:spunct`
|
155
|
+
(for in-sentence punctuation), and `:break`.
|
121
156
|
|
122
157
|
## Contributing
|
123
|
-
|
124
158
|
1. Fork it;
|
125
159
|
2. Create your feature branch (`git checkout -b my-new-feature`);
|
126
160
|
3. Commit your changes (`git commit -am 'Added some feature'`);
|
@@ -131,6 +165,8 @@ punctuation), and `:break`.
|
|
131
165
|
|
132
166
|
## Dependency Status [<img src="https://gemnasium.com/ustalov/greeb.png"/>](https://gemnasium.com/ustalov/greeb)
|
133
167
|
|
168
|
+
## Code Climate [<img src="https://codeclimate.com/github/ustalov/greeb.png"/>](https://codeclimate.com/github/ustalov/greeb)
|
169
|
+
|
134
170
|
## Copyright
|
135
171
|
|
136
172
|
Copyright (c) 2010-2013 [Dmitry Ustalov]. See LICENSE for details.
|
data/bin/greeb
CHANGED
data/greeb.gemspec
CHANGED
@@ -17,8 +17,6 @@ Gem::Specification.new do |s|
|
|
17
17
|
|
18
18
|
s.add_development_dependency 'rake'
|
19
19
|
s.add_development_dependency 'minitest', '>= 2.11'
|
20
|
-
s.add_development_dependency 'simplecov'
|
21
|
-
s.add_development_dependency 'yard'
|
22
20
|
|
23
21
|
s.files = `git ls-files`.split("\n")
|
24
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/lib/greeb/parser.rb
CHANGED
@@ -7,11 +7,11 @@
|
|
7
7
|
module Greeb::Parser
|
8
8
|
extend self
|
9
9
|
|
10
|
-
# URL pattern. Not so precise, but IDN-compatible.
|
11
|
-
URL =
|
10
|
+
# A URL pattern. Not so precise, but IDN-compatible.
|
11
|
+
URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
|
12
12
|
|
13
|
-
#
|
14
|
-
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/
|
13
|
+
# A horrible e-mail pattern.
|
14
|
+
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
|
15
15
|
|
16
16
|
# Recognize URLs in the input text. Actually, URL is obsolete standard
|
17
17
|
# and this code should be rewritten to use the URI concept.
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -15,7 +15,7 @@ class Greeb::Segmentator
|
|
15
15
|
#
|
16
16
|
# @param tokens [Array<Greeb::Entity>] tokens from [Greeb::Tokenizer].
|
17
17
|
#
|
18
|
-
def initialize
|
18
|
+
def initialize(tokens)
|
19
19
|
@tokens = tokens
|
20
20
|
end
|
21
21
|
|
@@ -44,7 +44,7 @@ class Greeb::Segmentator
|
|
44
44
|
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
45
45
|
# sentences as keys and tokens arrays as values.
|
46
46
|
#
|
47
|
-
def extract
|
47
|
+
def extract(sentences)
|
48
48
|
Hash[
|
49
49
|
sentences.map do |s|
|
50
50
|
[s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
|
@@ -59,7 +59,7 @@ class Greeb::Segmentator
|
|
59
59
|
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
60
60
|
# sentences as keys and subsentences arrays as values.
|
61
61
|
#
|
62
|
-
def subextract
|
62
|
+
def subextract(sentences)
|
63
63
|
Hash[
|
64
64
|
sentences.map do |s|
|
65
65
|
[s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
|
@@ -88,8 +88,7 @@ class Greeb::Segmentator
|
|
88
88
|
if :punct == token.type
|
89
89
|
sentence.to = tokens.
|
90
90
|
select { |t| t.from >= token.from }.
|
91
|
-
inject(token) { |r, t| break r if t.type != token.type; t }.
|
92
|
-
to
|
91
|
+
inject(token) { |r, t| break r if t.type != token.type; t }.to
|
93
92
|
|
94
93
|
@sentences << sentence
|
95
94
|
sentence = new_sentence
|
@@ -100,7 +99,7 @@ class Greeb::Segmentator
|
|
100
99
|
sentence
|
101
100
|
end
|
102
101
|
|
103
|
-
nil.tap { @sentences << rest if rest.from
|
102
|
+
nil.tap { @sentences << rest if rest.from && rest.to }
|
104
103
|
end
|
105
104
|
|
106
105
|
# Implementation of the subsentence detection method. This method
|
@@ -112,19 +111,18 @@ class Greeb::Segmentator
|
|
112
111
|
@subsentences = SortedSet.new
|
113
112
|
|
114
113
|
rest = tokens.inject(new_subsentence) do |subsentence, token|
|
115
|
-
if !subsentence.from
|
114
|
+
if !subsentence.from && SENTENCE_DOESNT_START.include?(token.type)
|
116
115
|
next subsentence
|
117
116
|
end
|
118
117
|
|
119
118
|
subsentence.from = token.from unless subsentence.from
|
120
119
|
|
121
|
-
next subsentence if subsentence.to
|
120
|
+
next subsentence if subsentence.to && subsentence.to > token.to
|
122
121
|
|
123
122
|
if [:punct, :spunct].include? token.type
|
124
123
|
subsentence.to = tokens.
|
125
124
|
select { |t| t.from >= token.from }.
|
126
|
-
inject(token) { |r, t| break r if t.type != token.type; t }.
|
127
|
-
to
|
125
|
+
inject(token) { |r, t| break r if t.type != token.type; t }.to
|
128
126
|
|
129
127
|
@subsentences << subsentence
|
130
128
|
subsentence = new_subsentence
|
@@ -135,7 +133,7 @@ class Greeb::Segmentator
|
|
135
133
|
subsentence
|
136
134
|
end
|
137
135
|
|
138
|
-
nil.tap { @subsentences << rest if rest.from
|
136
|
+
nil.tap { @subsentences << rest if rest.from && rest.to }
|
139
137
|
end
|
140
138
|
|
141
139
|
private
|
data/lib/greeb/version.rb
CHANGED
data/spec/bin_spec.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe 'CLI' do
|
6
|
+
it 'should do nothing when ran without input' do
|
7
|
+
invoke('').must_be_empty
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should tokenize text when input is given' do
|
11
|
+
invoke(stdin: 'Hello guys!').must_equal(
|
12
|
+
%w(Hello guys !))
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should extract URLs' do
|
16
|
+
invoke(stdin: 'Hello http://nlpub.ru guys!').must_equal(
|
17
|
+
%w(Hello http://nlpub.ru guys !))
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should extract e-mails' do
|
21
|
+
invoke(stdin: 'Hello example@example.com guys!').must_equal(
|
22
|
+
%w(Hello example@example.com guys !))
|
23
|
+
end
|
24
|
+
end
|
data/spec/segmentator_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -2,13 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
if RUBY_VERSION == '1.8'
|
8
|
-
gem 'minitest'
|
9
|
-
end
|
10
|
-
|
5
|
+
gem 'minitest'
|
11
6
|
require 'minitest/autorun'
|
7
|
+
require 'minitest/hell'
|
12
8
|
|
13
9
|
unless 'true' == ENV['TRAVIS']
|
14
10
|
require 'simplecov'
|
@@ -17,4 +13,7 @@ unless 'true' == ENV['TRAVIS']
|
|
17
13
|
end
|
18
14
|
end
|
19
15
|
|
16
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
20
17
|
require 'greeb'
|
18
|
+
|
19
|
+
Dir[File.expand_path('../support/**/*.rb', __FILE__)].each { |f| require f }
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
# http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
|
6
|
+
#
|
7
|
+
class MiniTest::Unit::TestCase
|
8
|
+
# Quas Wex Exort.
|
9
|
+
#
|
10
|
+
def invoke_cache
|
11
|
+
@invoke_cache ||= {}
|
12
|
+
end
|
13
|
+
|
14
|
+
# So begins a new age of knowledge.
|
15
|
+
#
|
16
|
+
def invoke(*argv)
|
17
|
+
return invoke_cache[argv] if invoke_cache.has_key? argv
|
18
|
+
|
19
|
+
arguments = argv.dup
|
20
|
+
options = (arguments.last.is_a? Hash) ? arguments.pop : {}
|
21
|
+
executable = File.expand_path('../../../bin/greeb', __FILE__)
|
22
|
+
|
23
|
+
Open3.popen3(executable, *arguments) do |i, o, *_|
|
24
|
+
i.puts options[:stdin] if options[:stdin]
|
25
|
+
i.close
|
26
|
+
invoke_cache[argv] = o.readlines.map(&:chomp!)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/spec/tokenizer_spec.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.
|
4
|
+
version: 0.2.0.pre3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-04-
|
11
|
+
date: 2013-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -38,34 +38,6 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '2.11'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: simplecov
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - '>='
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: yard
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - '>='
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
41
|
description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
|
70
42
|
written in Ruby.
|
71
43
|
email:
|
@@ -76,6 +48,7 @@ extensions: []
|
|
76
48
|
extra_rdoc_files: []
|
77
49
|
files:
|
78
50
|
- .gitignore
|
51
|
+
- .rubocop.yml
|
79
52
|
- .travis.yml
|
80
53
|
- .yardopts
|
81
54
|
- Gemfile
|
@@ -90,9 +63,11 @@ files:
|
|
90
63
|
- lib/greeb/strscan.rb
|
91
64
|
- lib/greeb/tokenizer.rb
|
92
65
|
- lib/greeb/version.rb
|
66
|
+
- spec/bin_spec.rb
|
93
67
|
- spec/parser_spec.rb
|
94
68
|
- spec/segmentator_spec.rb
|
95
69
|
- spec/spec_helper.rb
|
70
|
+
- spec/support/invoker.rb
|
96
71
|
- spec/tokenizer_spec.rb
|
97
72
|
homepage: https://github.com/ustalov/greeb
|
98
73
|
licenses: []
|
@@ -113,13 +88,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
113
88
|
version: 1.3.1
|
114
89
|
requirements: []
|
115
90
|
rubyforge_project: greeb
|
116
|
-
rubygems_version: 2.0.
|
91
|
+
rubygems_version: 2.0.0
|
117
92
|
signing_key:
|
118
93
|
specification_version: 4
|
119
94
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
120
95
|
test_files:
|
96
|
+
- spec/bin_spec.rb
|
121
97
|
- spec/parser_spec.rb
|
122
98
|
- spec/segmentator_spec.rb
|
123
99
|
- spec/spec_helper.rb
|
100
|
+
- spec/support/invoker.rb
|
124
101
|
- spec/tokenizer_spec.rb
|
125
102
|
has_rdoc:
|