greeb 0.2.1 → 0.2.2.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/greeb +2 -15
- data/greeb.gemspec +16 -16
- data/lib/greeb.rb +1 -0
- data/lib/greeb/core.rb +45 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +42 -0
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bff91e4559c7b7d1f83ef83212455d4b4b282351
|
4
|
+
data.tar.gz: 867bb10aeb676a2608b701dd57c5a68b215eeaa9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f116b3773ac59a02e17c6cece192cb5d8cc21ae976e2e78778752c602ef87b4e6cfe9c383e4b09150b11e9c4dbc08788b27a34834c15023d315b08d08e922d26
|
7
|
+
data.tar.gz: 2532a596959945be6793265fc062f9c89d7676df2a28a535e1662b8bf4d848ac66f35e3bf299282dc53e3ab72ce9efe76382079ada9476856da14361ce19df53
|
data/bin/greeb
CHANGED
@@ -6,21 +6,8 @@ end
|
|
6
6
|
|
7
7
|
require 'greeb'
|
8
8
|
|
9
|
-
text = STDIN.read
|
10
|
-
text.chomp!
|
9
|
+
text = STDIN.read.tap(&:chomp!)
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
extract = proc do |entity|
|
15
|
-
from = tokens.index { |e| e.from == entity.from }
|
16
|
-
to = tokens.index { |e| e.to == entity.to }
|
17
|
-
tokens[from..to] = entity
|
18
|
-
end
|
19
|
-
|
20
|
-
Greeb::Parser.urls(text).each(&extract)
|
21
|
-
Greeb::Parser.emails(text).each(&extract)
|
22
|
-
Greeb::Parser.abbrevs(text).each(&extract)
|
23
|
-
|
24
|
-
tokens.each do |entity|
|
11
|
+
Greeb[text].each do |entity|
|
25
12
|
puts text[entity.from...entity.to] unless entity.type == :separ
|
26
13
|
end
|
data/greeb.gemspec
CHANGED
@@ -2,23 +2,23 @@
|
|
2
2
|
|
3
3
|
require File.expand_path('../lib/greeb/version', __FILE__)
|
4
4
|
|
5
|
-
Gem::Specification.new do |
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'greeb'
|
7
|
+
spec.version = Greeb::VERSION
|
8
|
+
spec.platform = Gem::Platform::RUBY
|
9
|
+
spec.authors = ['Dmitry Ustalov']
|
10
|
+
spec.email = ['dmitry@eveel.ru']
|
11
|
+
spec.homepage = 'https://github.com/dmchk/greeb'
|
12
|
+
spec.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
|
13
|
+
spec.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
|
14
|
+
'regexp-based tokenizer, written in Ruby.'
|
15
15
|
|
16
|
-
|
16
|
+
spec.rubyforge_project = 'greeb'
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
spec.files = `git ls-files`.split("\n")
|
19
|
+
spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
20
|
+
spec.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
21
|
+
spec.require_paths = ['lib']
|
22
22
|
|
23
|
-
|
23
|
+
spec.add_development_dependency 'minitest', '~> 5.0'
|
24
24
|
end
|
data/lib/greeb.rb
CHANGED
data/lib/greeb/core.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# Greeb::Core is a simple tool that allows to invoke Greeb::Tokenizer and
|
4
|
+
# Greeb::Parser facilities together in a convenient and coherent way.
|
5
|
+
#
|
6
|
+
module Greeb::Core
|
7
|
+
# Greeb::Core uses several helpers from Greeb::Parser to perform
|
8
|
+
# additional analysis using their heuristic methods.
|
9
|
+
#
|
10
|
+
HELPERS = [:urls, :emails, :abbrevs]
|
11
|
+
|
12
|
+
# Tokenize the input text and merge in the entities found by each HELPERS parser.
|
13
|
+
#
|
14
|
+
# @param text [String] input text.
|
15
|
+
#
|
16
|
+
# @return [Array<Greeb::Entity>] a set of tokens.
|
17
|
+
#
|
18
|
+
def analyze text
|
19
|
+
Greeb::Tokenizer.tokenize(text).tap do |tokens|
|
20
|
+
HELPERS.each do |helper|
|
21
|
+
Greeb::Parser.public_send(helper, text).each do |parsed|
|
22
|
+
extract_tokens(tokens, parsed)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
alias_method :'[]', :analyze
|
29
|
+
|
30
|
+
protected
|
31
|
+
# Extract the given entity by replacing the tokens it spans in the input tokens set.
|
32
|
+
#
|
33
|
+
# @param tokens [Array<Greeb::Entity>] input tokens set.
|
34
|
+
# @param entity [Greeb::Entity] token to be extracted.
|
35
|
+
#
|
36
|
+
# @return [Greeb::Entity] token to be extracted.
|
37
|
+
#
|
38
|
+
def extract_tokens(tokens, entity)
|
39
|
+
from = tokens.index { |e| e.from == entity.from }
|
40
|
+
to = tokens.index { |e| e.to == entity.to }
|
41
|
+
tokens[from..to] = entity
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
Greeb.send(:extend, Greeb::Core)
|
data/lib/greeb/version.rb
CHANGED
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
module Greeb
|
6
|
+
describe Greeb do
|
7
|
+
it 'should do nothing when ran without input' do
|
8
|
+
Greeb[''].must_be_empty
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should tokenize text when input is given' do
|
12
|
+
Greeb['Hello guys!'].must_equal(
|
13
|
+
[Entity.new(0, 5, :letter),
|
14
|
+
Entity.new(5, 6, :separ),
|
15
|
+
Entity.new(6, 10, :letter),
|
16
|
+
Entity.new(10, 11, :punct)]
|
17
|
+
)
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should extract URLs' do
|
21
|
+
Greeb['Hello http://nlpub.ru guys!'].must_equal(
|
22
|
+
[Entity.new(0, 5, :letter),
|
23
|
+
Entity.new(5, 6, :separ),
|
24
|
+
Entity.new(6, 21, :url),
|
25
|
+
Entity.new(21, 22, :separ),
|
26
|
+
Entity.new(22, 26, :letter),
|
27
|
+
Entity.new(26, 27, :punct)]
|
28
|
+
)
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should extract e-mails' do
|
32
|
+
Greeb['Hello example@example.com guys!'].must_equal(
|
33
|
+
[Entity.new(0, 5, :letter),
|
34
|
+
Entity.new(5, 6, :separ),
|
35
|
+
Entity.new(6, 25, :email),
|
36
|
+
Entity.new(25, 26, :separ),
|
37
|
+
Entity.new(26, 30, :letter),
|
38
|
+
Entity.new(30, 31, :punct)]
|
39
|
+
)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2.pre1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
@@ -44,12 +44,14 @@ files:
|
|
44
44
|
- bin/greeb
|
45
45
|
- greeb.gemspec
|
46
46
|
- lib/greeb.rb
|
47
|
+
- lib/greeb/core.rb
|
47
48
|
- lib/greeb/parser.rb
|
48
49
|
- lib/greeb/segmentator.rb
|
49
50
|
- lib/greeb/strscan.rb
|
50
51
|
- lib/greeb/tokenizer.rb
|
51
52
|
- lib/greeb/version.rb
|
52
53
|
- spec/bin_spec.rb
|
54
|
+
- spec/core_spec.rb
|
53
55
|
- spec/parser_spec.rb
|
54
56
|
- spec/segmentator_spec.rb
|
55
57
|
- spec/spec_helper.rb
|
@@ -69,17 +71,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
71
|
version: '0'
|
70
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
73
|
requirements:
|
72
|
-
- - '
|
74
|
+
- - '>'
|
73
75
|
- !ruby/object:Gem::Version
|
74
|
-
version:
|
76
|
+
version: 1.3.1
|
75
77
|
requirements: []
|
76
78
|
rubyforge_project: greeb
|
77
|
-
rubygems_version: 2.
|
79
|
+
rubygems_version: 2.1.9
|
78
80
|
signing_key:
|
79
81
|
specification_version: 4
|
80
82
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
81
83
|
test_files:
|
82
84
|
- spec/bin_spec.rb
|
85
|
+
- spec/core_spec.rb
|
83
86
|
- spec/parser_spec.rb
|
84
87
|
- spec/segmentator_spec.rb
|
85
88
|
- spec/spec_helper.rb
|