greeb 0.2.1 → 0.2.2.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/greeb +2 -15
- data/greeb.gemspec +16 -16
- data/lib/greeb.rb +1 -0
- data/lib/greeb/core.rb +45 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +42 -0
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bff91e4559c7b7d1f83ef83212455d4b4b282351
|
4
|
+
data.tar.gz: 867bb10aeb676a2608b701dd57c5a68b215eeaa9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f116b3773ac59a02e17c6cece192cb5d8cc21ae976e2e78778752c602ef87b4e6cfe9c383e4b09150b11e9c4dbc08788b27a34834c15023d315b08d08e922d26
|
7
|
+
data.tar.gz: 2532a596959945be6793265fc062f9c89d7676df2a28a535e1662b8bf4d848ac66f35e3bf299282dc53e3ab72ce9efe76382079ada9476856da14361ce19df53
|
data/bin/greeb
CHANGED
@@ -6,21 +6,8 @@ end
|
|
6
6
|
|
7
7
|
require 'greeb'
|
8
8
|
|
9
|
-
text = STDIN.read
|
10
|
-
text.chomp!
|
9
|
+
text = STDIN.read.tap(&:chomp!)
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
extract = proc do |entity|
|
15
|
-
from = tokens.index { |e| e.from == entity.from }
|
16
|
-
to = tokens.index { |e| e.to == entity.to }
|
17
|
-
tokens[from..to] = entity
|
18
|
-
end
|
19
|
-
|
20
|
-
Greeb::Parser.urls(text).each(&extract)
|
21
|
-
Greeb::Parser.emails(text).each(&extract)
|
22
|
-
Greeb::Parser.abbrevs(text).each(&extract)
|
23
|
-
|
24
|
-
tokens.each do |entity|
|
11
|
+
Greeb[text].each do |entity|
|
25
12
|
puts text[entity.from...entity.to] unless entity.type == :separ
|
26
13
|
end
|
data/greeb.gemspec
CHANGED
@@ -2,23 +2,23 @@
|
|
2
2
|
|
3
3
|
require File.expand_path('../lib/greeb/version', __FILE__)
|
4
4
|
|
5
|
-
Gem::Specification.new do |
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'greeb'
|
7
|
+
spec.version = Greeb::VERSION
|
8
|
+
spec.platform = Gem::Platform::RUBY
|
9
|
+
spec.authors = ['Dmitry Ustalov']
|
10
|
+
spec.email = ['dmitry@eveel.ru']
|
11
|
+
spec.homepage = 'https://github.com/dmchk/greeb'
|
12
|
+
spec.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
|
13
|
+
spec.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
|
14
|
+
'regexp-based tokenizer, written in Ruby.'
|
15
15
|
|
16
|
-
|
16
|
+
spec.rubyforge_project = 'greeb'
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
spec.files = `git ls-files`.split("\n")
|
19
|
+
spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
20
|
+
spec.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
21
|
+
spec.require_paths = ['lib']
|
22
22
|
|
23
|
-
|
23
|
+
spec.add_development_dependency 'minitest', '~> 5.0'
|
24
24
|
end
|
data/lib/greeb.rb
CHANGED
data/lib/greeb/core.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# Greeb::Core is a simple tool that allows to invoke Greeb::Tokenizer and
|
4
|
+
# Greeb::Parser facilities together in a convenient and coherent way.
|
5
|
+
#
|
6
|
+
module Greeb::Core
|
7
|
+
# Greeb::Core uses several helpers from Greeb::Parser to perform
|
8
|
+
# additional analysis using their heuristic methods.
|
9
|
+
#
|
10
|
+
HELPERS = [:urls, :emails, :abbrevs]
|
11
|
+
|
12
|
+
# Tokenize the input text and merge in the entities found by the helpers.
|
13
|
+
#
|
14
|
+
# @param text [String] input text.
|
15
|
+
#
|
16
|
+
# @return [Array<Greeb::Entity>] a set of tokens.
|
17
|
+
#
|
18
|
+
def analyze text
|
19
|
+
Greeb::Tokenizer.tokenize(text).tap do |tokens|
|
20
|
+
HELPERS.each do |helper|
|
21
|
+
Greeb::Parser.public_send(helper, text).each do |parsed|
|
22
|
+
extract_tokens(tokens, parsed)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
alias_method :'[]', :analyze
|
29
|
+
|
30
|
+
protected
|
31
|
+
# Extract tokens of the specified type from the input tokens set.
|
32
|
+
#
|
33
|
+
# @param tokens [Array<Greeb::Entity>] input tokens set.
|
34
|
+
# @param entity [Greeb::Entity] token to be extracted.
|
35
|
+
#
|
36
|
+
# @return [Greeb::Entity] token to be extracted.
|
37
|
+
#
|
38
|
+
def extract_tokens(tokens, entity)
|
39
|
+
from = tokens.index { |e| e.from == entity.from }
|
40
|
+
to = tokens.index { |e| e.to == entity.to }
|
41
|
+
tokens[from..to] = entity
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
Greeb.send(:extend, Greeb::Core)
|
data/lib/greeb/version.rb
CHANGED
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
module Greeb
|
6
|
+
describe Greeb do
|
7
|
+
it 'should do nothing when ran without input' do
|
8
|
+
Greeb[''].must_be_empty
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should tokenize text when input is given' do
|
12
|
+
Greeb['Hello guys!'].must_equal(
|
13
|
+
[Entity.new(0, 5, :letter),
|
14
|
+
Entity.new(5, 6, :separ),
|
15
|
+
Entity.new(6, 10, :letter),
|
16
|
+
Entity.new(10, 11, :punct)]
|
17
|
+
)
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should extract URLs' do
|
21
|
+
Greeb['Hello http://nlpub.ru guys!'].must_equal(
|
22
|
+
[Entity.new(0, 5, :letter),
|
23
|
+
Entity.new(5, 6, :separ),
|
24
|
+
Entity.new(6, 21, :url),
|
25
|
+
Entity.new(21, 22, :separ),
|
26
|
+
Entity.new(22, 26, :letter),
|
27
|
+
Entity.new(26, 27, :punct)]
|
28
|
+
)
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should extract e-mails' do
|
32
|
+
Greeb['Hello example@example.com guys!'].must_equal(
|
33
|
+
[Entity.new(0, 5, :letter),
|
34
|
+
Entity.new(5, 6, :separ),
|
35
|
+
Entity.new(6, 25, :email),
|
36
|
+
Entity.new(25, 26, :separ),
|
37
|
+
Entity.new(26, 30, :letter),
|
38
|
+
Entity.new(30, 31, :punct)]
|
39
|
+
)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2.pre1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
@@ -44,12 +44,14 @@ files:
|
|
44
44
|
- bin/greeb
|
45
45
|
- greeb.gemspec
|
46
46
|
- lib/greeb.rb
|
47
|
+
- lib/greeb/core.rb
|
47
48
|
- lib/greeb/parser.rb
|
48
49
|
- lib/greeb/segmentator.rb
|
49
50
|
- lib/greeb/strscan.rb
|
50
51
|
- lib/greeb/tokenizer.rb
|
51
52
|
- lib/greeb/version.rb
|
52
53
|
- spec/bin_spec.rb
|
54
|
+
- spec/core_spec.rb
|
53
55
|
- spec/parser_spec.rb
|
54
56
|
- spec/segmentator_spec.rb
|
55
57
|
- spec/spec_helper.rb
|
@@ -69,17 +71,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
71
|
version: '0'
|
70
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
73
|
requirements:
|
72
|
-
- - '
|
74
|
+
- - '>'
|
73
75
|
- !ruby/object:Gem::Version
|
74
|
-
version:
|
76
|
+
version: 1.3.1
|
75
77
|
requirements: []
|
76
78
|
rubyforge_project: greeb
|
77
|
-
rubygems_version: 2.
|
79
|
+
rubygems_version: 2.1.9
|
78
80
|
signing_key:
|
79
81
|
specification_version: 4
|
80
82
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
81
83
|
test_files:
|
82
84
|
- spec/bin_spec.rb
|
85
|
+
- spec/core_spec.rb
|
83
86
|
- spec/parser_spec.rb
|
84
87
|
- spec/segmentator_spec.rb
|
85
88
|
- spec/spec_helper.rb
|