greeb 0.2.1 → 0.2.2.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d36a8ea4366f3bbc2b14b3f02dd7fadf0f25571e
4
- data.tar.gz: 42ecf5ddd8c4b8dac3ffa4394ce8c3211f91d918
3
+ metadata.gz: bff91e4559c7b7d1f83ef83212455d4b4b282351
4
+ data.tar.gz: 867bb10aeb676a2608b701dd57c5a68b215eeaa9
5
5
  SHA512:
6
- metadata.gz: 72282ee20c4566ccceac0a4f51aa3ad8ea213458502d2b5ed97c664c06fbc8aa8a28b5fb1f4e8bb84f0de2d7ea292d54d1500e646524b18c9af192069596e63e
7
- data.tar.gz: 1188dc13cac34f31d2536b94d460dfe5948f86095aaebd43c1a85da9f521ba53ae1ebdd94df6cf5702cb2f64df4d0cce37fee6f7a4d308feb230bc78729a79ce
6
+ metadata.gz: f116b3773ac59a02e17c6cece192cb5d8cc21ae976e2e78778752c602ef87b4e6cfe9c383e4b09150b11e9c4dbc08788b27a34834c15023d315b08d08e922d26
7
+ data.tar.gz: 2532a596959945be6793265fc062f9c89d7676df2a28a535e1662b8bf4d848ac66f35e3bf299282dc53e3ab72ce9efe76382079ada9476856da14361ce19df53
data/bin/greeb CHANGED
@@ -6,21 +6,8 @@ end
6
6
 
7
7
  require 'greeb'
8
8
 
9
- text = STDIN.read
10
- text.chomp!
9
+ text = STDIN.read.tap(&:chomp!)
11
10
 
12
- tokens = Greeb::Tokenizer.tokenize(text)
13
-
14
- extract = proc do |entity|
15
- from = tokens.index { |e| e.from == entity.from }
16
- to = tokens.index { |e| e.to == entity.to }
17
- tokens[from..to] = entity
18
- end
19
-
20
- Greeb::Parser.urls(text).each(&extract)
21
- Greeb::Parser.emails(text).each(&extract)
22
- Greeb::Parser.abbrevs(text).each(&extract)
23
-
24
- tokens.each do |entity|
11
+ Greeb[text].each do |entity|
25
12
  puts text[entity.from...entity.to] unless entity.type == :separ
26
13
  end
data/greeb.gemspec CHANGED
@@ -2,23 +2,23 @@
2
2
 
3
3
  require File.expand_path('../lib/greeb/version', __FILE__)
4
4
 
5
- Gem::Specification.new do |s|
6
- s.name = 'greeb'
7
- s.version = Greeb::VERSION
8
- s.platform = Gem::Platform::RUBY
9
- s.authors = ['Dmitry Ustalov']
10
- s.email = ['dmitry@eveel.ru']
11
- s.homepage = 'https://github.com/dmchk/greeb'
12
- s.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
13
- s.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
14
- 'regexp-based tokenizer, written in Ruby.'
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'greeb'
7
+ spec.version = Greeb::VERSION
8
+ spec.platform = Gem::Platform::RUBY
9
+ spec.authors = ['Dmitry Ustalov']
10
+ spec.email = ['dmitry@eveel.ru']
11
+ spec.homepage = 'https://github.com/dmchk/greeb'
12
+ spec.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
13
+ spec.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
14
+ 'regexp-based tokenizer, written in Ruby.'
15
15
 
16
- s.rubyforge_project = 'greeb'
16
+ spec.rubyforge_project = 'greeb'
17
17
 
18
- s.files = `git ls-files`.split("\n")
19
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
20
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
21
- s.require_paths = ['lib']
18
+ spec.files = `git ls-files`.split("\n")
19
+ spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
20
+ spec.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
21
+ spec.require_paths = ['lib']
22
22
 
23
- s.add_development_dependency 'minitest', '~> 5.0'
23
+ spec.add_development_dependency 'minitest', '~> 5.0'
24
24
  end
data/lib/greeb.rb CHANGED
@@ -44,3 +44,4 @@ require 'greeb/strscan'
44
44
  require 'greeb/tokenizer'
45
45
  require 'greeb/segmentator'
46
46
  require 'greeb/parser'
47
+ require 'greeb/core'
data/lib/greeb/core.rb ADDED
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+
3
+ # Greeb::Core is a simple tool that allows to invoke Greeb::Tokenizer and
4
+ # Greeb::Parser facilities together in a convenient and coherent way.
5
+ #
6
+ module Greeb::Core
7
+ # Greeb::Core uses several helpers from Greeb::Parser to perform
8
+ # additional analysis using their heuristic methods.
9
+ #
10
+ HELPERS = [:urls, :emails, :abbrevs]
11
+
12
+ # Tokenize the input text and merge in the entities found by the Greeb::Parser helpers.
13
+ #
14
+ # @param text [String] input text.
15
+ #
16
+ # @return [Array<Greeb::Entity>] a set of tokens.
17
+ #
18
+ def analyze text
19
+ Greeb::Tokenizer.tokenize(text).tap do |tokens|
20
+ HELPERS.each do |helper|
21
+ Greeb::Parser.public_send(helper, text).each do |parsed|
22
+ extract_tokens(tokens, parsed)
23
+ end
24
+ end
25
+ end
26
+ end
27
+
28
+ alias_method :'[]', :analyze
29
+
30
+ protected
31
+ # Extract the given entity by replacing the tokens it spans in the input tokens set.
32
+ #
33
+ # @param tokens [Array<Greeb::Entity>] input tokens set.
34
+ # @param entity [Greeb::Entity] token to be extracted.
35
+ #
36
+ # @return [Greeb::Entity] token to be extracted.
37
+ #
38
+ def extract_tokens(tokens, entity)
39
+ from = tokens.index { |e| e.from == entity.from }
40
+ to = tokens.index { |e| e.to == entity.to }
41
+ tokens[from..to] = entity
42
+ end
43
+ end
44
+
45
+ Greeb.send(:extend, Greeb::Core)
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.1'
8
+ VERSION = '0.2.2.pre1'
9
9
  end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
5
+ module Greeb
6
+ describe Greeb do
7
+ it 'should do nothing when ran without input' do
8
+ Greeb[''].must_be_empty
9
+ end
10
+
11
+ it 'should tokenize text when input is given' do
12
+ Greeb['Hello guys!'].must_equal(
13
+ [Entity.new(0, 5, :letter),
14
+ Entity.new(5, 6, :separ),
15
+ Entity.new(6, 10, :letter),
16
+ Entity.new(10, 11, :punct)]
17
+ )
18
+ end
19
+
20
+ it 'should extract URLs' do
21
+ Greeb['Hello http://nlpub.ru guys!'].must_equal(
22
+ [Entity.new(0, 5, :letter),
23
+ Entity.new(5, 6, :separ),
24
+ Entity.new(6, 21, :url),
25
+ Entity.new(21, 22, :separ),
26
+ Entity.new(22, 26, :letter),
27
+ Entity.new(26, 27, :punct)]
28
+ )
29
+ end
30
+
31
+ it 'should extract e-mails' do
32
+ Greeb['Hello example@example.com guys!'].must_equal(
33
+ [Entity.new(0, 5, :letter),
34
+ Entity.new(5, 6, :separ),
35
+ Entity.new(6, 25, :email),
36
+ Entity.new(25, 26, :separ),
37
+ Entity.new(26, 30, :letter),
38
+ Entity.new(30, 31, :punct)]
39
+ )
40
+ end
41
+ end
42
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-02 00:00:00.000000000 Z
11
+ date: 2013-10-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -44,12 +44,14 @@ files:
44
44
  - bin/greeb
45
45
  - greeb.gemspec
46
46
  - lib/greeb.rb
47
+ - lib/greeb/core.rb
47
48
  - lib/greeb/parser.rb
48
49
  - lib/greeb/segmentator.rb
49
50
  - lib/greeb/strscan.rb
50
51
  - lib/greeb/tokenizer.rb
51
52
  - lib/greeb/version.rb
52
53
  - spec/bin_spec.rb
54
+ - spec/core_spec.rb
53
55
  - spec/parser_spec.rb
54
56
  - spec/segmentator_spec.rb
55
57
  - spec/spec_helper.rb
@@ -69,17 +71,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
69
71
  version: '0'
70
72
  required_rubygems_version: !ruby/object:Gem::Requirement
71
73
  requirements:
72
- - - '>='
74
+ - - '>'
73
75
  - !ruby/object:Gem::Version
74
- version: '0'
76
+ version: 1.3.1
75
77
  requirements: []
76
78
  rubyforge_project: greeb
77
- rubygems_version: 2.0.3
79
+ rubygems_version: 2.1.9
78
80
  signing_key:
79
81
  specification_version: 4
80
82
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
81
83
  test_files:
82
84
  - spec/bin_spec.rb
85
+ - spec/core_spec.rb
83
86
  - spec/parser_spec.rb
84
87
  - spec/segmentator_spec.rb
85
88
  - spec/spec_helper.rb