greeb 0.2.1 → 0.2.2.pre1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d36a8ea4366f3bbc2b14b3f02dd7fadf0f25571e
4
- data.tar.gz: 42ecf5ddd8c4b8dac3ffa4394ce8c3211f91d918
3
+ metadata.gz: bff91e4559c7b7d1f83ef83212455d4b4b282351
4
+ data.tar.gz: 867bb10aeb676a2608b701dd57c5a68b215eeaa9
5
5
  SHA512:
6
- metadata.gz: 72282ee20c4566ccceac0a4f51aa3ad8ea213458502d2b5ed97c664c06fbc8aa8a28b5fb1f4e8bb84f0de2d7ea292d54d1500e646524b18c9af192069596e63e
7
- data.tar.gz: 1188dc13cac34f31d2536b94d460dfe5948f86095aaebd43c1a85da9f521ba53ae1ebdd94df6cf5702cb2f64df4d0cce37fee6f7a4d308feb230bc78729a79ce
6
+ metadata.gz: f116b3773ac59a02e17c6cece192cb5d8cc21ae976e2e78778752c602ef87b4e6cfe9c383e4b09150b11e9c4dbc08788b27a34834c15023d315b08d08e922d26
7
+ data.tar.gz: 2532a596959945be6793265fc062f9c89d7676df2a28a535e1662b8bf4d848ac66f35e3bf299282dc53e3ab72ce9efe76382079ada9476856da14361ce19df53
data/bin/greeb CHANGED
@@ -6,21 +6,8 @@ end
6
6
 
7
7
  require 'greeb'
8
8
 
9
- text = STDIN.read
10
- text.chomp!
9
+ text = STDIN.read.tap(&:chomp!)
11
10
 
12
- tokens = Greeb::Tokenizer.tokenize(text)
13
-
14
- extract = proc do |entity|
15
- from = tokens.index { |e| e.from == entity.from }
16
- to = tokens.index { |e| e.to == entity.to }
17
- tokens[from..to] = entity
18
- end
19
-
20
- Greeb::Parser.urls(text).each(&extract)
21
- Greeb::Parser.emails(text).each(&extract)
22
- Greeb::Parser.abbrevs(text).each(&extract)
23
-
24
- tokens.each do |entity|
11
+ Greeb[text].each do |entity|
25
12
  puts text[entity.from...entity.to] unless entity.type == :separ
26
13
  end
data/greeb.gemspec CHANGED
@@ -2,23 +2,23 @@
2
2
 
3
3
  require File.expand_path('../lib/greeb/version', __FILE__)
4
4
 
5
- Gem::Specification.new do |s|
6
- s.name = 'greeb'
7
- s.version = Greeb::VERSION
8
- s.platform = Gem::Platform::RUBY
9
- s.authors = ['Dmitry Ustalov']
10
- s.email = ['dmitry@eveel.ru']
11
- s.homepage = 'https://github.com/dmchk/greeb'
12
- s.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
13
- s.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
14
- 'regexp-based tokenizer, written in Ruby.'
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'greeb'
7
+ spec.version = Greeb::VERSION
8
+ spec.platform = Gem::Platform::RUBY
9
+ spec.authors = ['Dmitry Ustalov']
10
+ spec.email = ['dmitry@eveel.ru']
11
+ spec.homepage = 'https://github.com/dmchk/greeb'
12
+ spec.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
13
+ spec.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
14
+ 'regexp-based tokenizer, written in Ruby.'
15
15
 
16
- s.rubyforge_project = 'greeb'
16
+ spec.rubyforge_project = 'greeb'
17
17
 
18
- s.files = `git ls-files`.split("\n")
19
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
20
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
21
- s.require_paths = ['lib']
18
+ spec.files = `git ls-files`.split("\n")
19
+ spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
20
+ spec.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
21
+ spec.require_paths = ['lib']
22
22
 
23
- s.add_development_dependency 'minitest', '~> 5.0'
23
+ spec.add_development_dependency 'minitest', '~> 5.0'
24
24
  end
data/lib/greeb.rb CHANGED
@@ -44,3 +44,4 @@ require 'greeb/strscan'
44
44
  require 'greeb/tokenizer'
45
45
  require 'greeb/segmentator'
46
46
  require 'greeb/parser'
47
+ require 'greeb/core'
data/lib/greeb/core.rb ADDED
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+
3
+ # Greeb::Core is a simple tool that allows to invoke Greeb::Tokenizer and
4
+ # Greeb::Parser facilities together in a convenient and coherent way.
5
+ #
6
+ module Greeb::Core
7
+ # Greeb::Core uses several helpers from Greeb::Parser to perform
8
+ # additional analysis using their heuristic methods.
9
+ #
10
+ HELPERS = [:urls, :emails, :abbrevs]
11
+
12
+ # Tokenize the input text and merge in the entities found by every helper in HELPERS.
13
+ #
14
+ # @param text [String] input text.
15
+ #
16
+ # @return [Array<Greeb::Entity>] a set of tokens.
17
+ #
18
+ def analyze text
19
+ Greeb::Tokenizer.tokenize(text).tap do |tokens|
20
+ HELPERS.each do |helper|
21
+ Greeb::Parser.public_send(helper, text).each do |parsed|
22
+ extract_tokens(tokens, parsed)
23
+ end
24
+ end
25
+ end
26
+ end
27
+
28
+ alias_method :'[]', :analyze
29
+
30
+ protected
31
+ # Extract tokens of the specified type from the input tokens set.
32
+ #
33
+ # @param tokens [Array<Greeb::Entity>] input tokens set.
34
+ # @param entity [Greeb::Entity] token to be extracted.
35
+ #
36
+ # @return [Greeb::Entity] token to be extracted.
37
+ #
38
+ def extract_tokens(tokens, entity)
39
+ from = tokens.index { |e| e.from == entity.from }
40
+ to = tokens.index { |e| e.to == entity.to }
41
+ tokens[from..to] = entity
42
+ end
43
+ end
44
+
45
+ Greeb.send(:extend, Greeb::Core)
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.1'
8
+ VERSION = '0.2.2.pre1'
9
9
  end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
5
+ module Greeb
6
+ describe Greeb do
7
+ it 'should do nothing when ran without input' do
8
+ Greeb[''].must_be_empty
9
+ end
10
+
11
+ it 'should tokenize text when input is given' do
12
+ Greeb['Hello guys!'].must_equal(
13
+ [Entity.new(0, 5, :letter),
14
+ Entity.new(5, 6, :separ),
15
+ Entity.new(6, 10, :letter),
16
+ Entity.new(10, 11, :punct)]
17
+ )
18
+ end
19
+
20
+ it 'should extract URLs' do
21
+ Greeb['Hello http://nlpub.ru guys!'].must_equal(
22
+ [Entity.new(0, 5, :letter),
23
+ Entity.new(5, 6, :separ),
24
+ Entity.new(6, 21, :url),
25
+ Entity.new(21, 22, :separ),
26
+ Entity.new(22, 26, :letter),
27
+ Entity.new(26, 27, :punct)]
28
+ )
29
+ end
30
+
31
+ it 'should extract e-mails' do
32
+ Greeb['Hello example@example.com guys!'].must_equal(
33
+ [Entity.new(0, 5, :letter),
34
+ Entity.new(5, 6, :separ),
35
+ Entity.new(6, 25, :email),
36
+ Entity.new(25, 26, :separ),
37
+ Entity.new(26, 30, :letter),
38
+ Entity.new(30, 31, :punct)]
39
+ )
40
+ end
41
+ end
42
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-02 00:00:00.000000000 Z
11
+ date: 2013-10-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -44,12 +44,14 @@ files:
44
44
  - bin/greeb
45
45
  - greeb.gemspec
46
46
  - lib/greeb.rb
47
+ - lib/greeb/core.rb
47
48
  - lib/greeb/parser.rb
48
49
  - lib/greeb/segmentator.rb
49
50
  - lib/greeb/strscan.rb
50
51
  - lib/greeb/tokenizer.rb
51
52
  - lib/greeb/version.rb
52
53
  - spec/bin_spec.rb
54
+ - spec/core_spec.rb
53
55
  - spec/parser_spec.rb
54
56
  - spec/segmentator_spec.rb
55
57
  - spec/spec_helper.rb
@@ -69,17 +71,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
69
71
  version: '0'
70
72
  required_rubygems_version: !ruby/object:Gem::Requirement
71
73
  requirements:
72
- - - '>='
74
+ - - '>'
73
75
  - !ruby/object:Gem::Version
74
- version: '0'
76
+ version: 1.3.1
75
77
  requirements: []
76
78
  rubyforge_project: greeb
77
- rubygems_version: 2.0.3
79
+ rubygems_version: 2.1.9
78
80
  signing_key:
79
81
  specification_version: 4
80
82
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
81
83
  test_files:
82
84
  - spec/bin_spec.rb
85
+ - spec/core_spec.rb
83
86
  - spec/parser_spec.rb
84
87
  - spec/segmentator_spec.rb
85
88
  - spec/spec_helper.rb