greeb 0.2.0.rc1 → 0.2.0.rc2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
4
- data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
3
+ metadata.gz: 78e6a2b607a8690b2fe171665e35272efd31b2ac
4
+ data.tar.gz: 4517bb06cc8e1f8b0be5fc47bca4bfeda0fcfd49
5
5
  SHA512:
6
- metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
7
- data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
6
+ metadata.gz: dd24765af042566249e0c8d3153aee640c626b8f838fdda03c7b6598ad30bd0935df47ad1d1c6658ccf0e4fd598a94052c99dee127becb2f9c59b3e8dafe1cf0
7
+ data.tar.gz: 9eca1a25e8837732827a282d3d532db35113620372a0bde41b64669cb3fd696cc4c149267e19a5cfe58df7367b2932942554ea88cfd26826c73db926ad6ca89d
data/README.md CHANGED
@@ -129,6 +129,7 @@ pp segmentator.extract(segmentator.sentences)
129
129
  Texts often include some special entities such as URLs and e-mail
130
130
  addresses. Greeb can help you retrieve these strings.
131
131
 
132
+ #### URL and E-mail retrieval
132
133
  ```ruby
133
134
  text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
134
135
 
@@ -145,6 +146,19 @@ pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
145
146
 
146
147
  Please don't use Greeb for spam list development purposes.
147
148
 
149
+ #### Abbreviation retrieval
150
+ ```ruby
151
+ text = 'Hello, G.L.H.F. everyone!'
152
+
153
+ pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
154
+ =begin
155
+ [[#<struct Greeb::Entity from=7, to=15, type=:abbrev>, "G.L.H.F."]]
156
+ =end
157
+ ```
158
+
159
+ The algorithm is not very accurate, but it is still useful in many practical
160
+ situations.
161
+
148
162
  ## Tokens
149
163
  Greeb operates with entities, tuples of *(from, to, kind)*, where
150
164
  *from* is a beginning of the entity, *to* is an ending of the entity,
data/greeb.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
16
16
  s.rubyforge_project = 'greeb'
17
17
 
18
18
  s.add_development_dependency 'rake'
19
- s.add_development_dependency 'minitest', '>= 2.11'
19
+ s.add_development_dependency 'minitest', '~> 5.0'
20
20
 
21
21
  s.files = `git ls-files`.split("\n")
22
22
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
data/lib/greeb/parser.rb CHANGED
@@ -13,6 +13,9 @@ module Greeb::Parser
13
13
  # A horrible e-mail pattern.
14
14
  EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
15
15
 
16
+ # Another horrible pattern. Now for abbreviations.
17
+ ABBREV = /\b(\p{L}\.)+/i
18
+
16
19
  # Recognize URLs in the input text. Actually, URL is an obsolete standard
17
20
  # and this code should be rewritten to use the URI concept.
18
21
  #
@@ -34,6 +37,16 @@ module Greeb::Parser
34
37
  scan(text, EMAIL, :email)
35
38
  end
36
39
 
40
+ # Recognize abbreviations in the input text.
41
+ #
42
+ # @param text [String] input text.
43
+ #
44
+ # @return [Array<Greeb::Entity>] found abbreviations.
45
+ #
46
+ def abbrevs(text)
47
+ scan(text, ABBREV, :abbrev)
48
+ end
49
+
37
50
  private
38
51
  # Implementation of regexp-based {Greeb::Entity} scanner.
39
52
  #
@@ -57,6 +57,20 @@ module Greeb::Tokenizer
57
57
  scanner.terminate
58
58
  end
59
59
 
60
+ # Split one line into an array of characters, but also combine duplicated
61
+ # characters.
62
+ #
63
+ # For instance, `"a b\n\n\nc"` would be transformed into the following
64
+ # array: `["a", " ", "b", "\n\n\n", "c"]`.
65
+ #
66
+ # @param token [String] a token to be split.
67
+ #
68
+ # @return [Array<String>] the resulting characters.
69
+ #
70
+ def split(token)
71
+ token.scan(/((.|\n)\2*)/).map(&:first)
72
+ end
73
+
60
74
  protected
61
75
  # One iteration of the tokenization process.
62
76
  #
@@ -115,18 +129,4 @@ module Greeb::Tokenizer
115
129
  before + s.length
116
130
  end
117
131
  end
118
-
119
- # Split one line into characters array, but also combine line breaks
120
- # into single elements.
121
- #
122
- # For instance, `"a b\n\n\nc"` would be transformed into the following
123
- # array: `["a", " ", "b", "\n\n\n", "c"]`.
124
- #
125
- # @param token [String] a token to be splitted.
126
- #
127
- # @return [Array<String>] splitted characters.
128
- #
129
- def split(token)
130
- token.scan(/((.|\n)\2*)/).map(&:first)
131
- end
132
132
  end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.0.rc1'
8
+ VERSION = '0.2.0.rc2'
9
9
  end
data/spec/parser_spec.rb CHANGED
@@ -5,9 +5,9 @@ require_relative 'spec_helper'
5
5
  module Greeb
6
6
  describe Parser do
7
7
  let(:text) do
8
- 'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
8
+ 'Hello there! My name is Vasya B. My website is: http://вася.рф/. ' \
9
9
  'And my e-mail is example@example.com! Also it is available by ' \
10
- 'URL: http://vasya.ru.'
10
+ 'URL: http://vasya.ru. Also, G.L.H.F. everyone!'
11
11
  end
12
12
 
13
13
  describe 'URL' do
@@ -15,8 +15,8 @@ module Greeb
15
15
 
16
16
  it 'recognizes URLs' do
17
17
  subject.must_equal(
18
- [Entity.new(46, 61, :url),
19
- Entity.new(130, 145, :url)]
18
+ [Entity.new(48, 63, :url),
19
+ Entity.new(132, 147, :url)]
20
20
  )
21
21
  end
22
22
  end
@@ -26,7 +26,18 @@ module Greeb
26
26
 
27
27
  it 'recognizes e-mails' do
28
28
  subject.must_equal(
29
- [Entity.new(80, 99, :email)]
29
+ [Entity.new(82, 101, :email)]
30
+ )
31
+ end
32
+ end
33
+
34
+ describe 'ABBREV' do
35
+ subject { Parser.abbrevs(text) }
36
+
37
+ it 'recognizes abbreviations' do
38
+ subject.must_equal(
39
+ [Entity.new(30, 32, :abbrev),
40
+ Entity.new(155, 163, :abbrev)]
30
41
  )
31
42
  end
32
43
  end
@@ -4,7 +4,7 @@ require 'open3'
4
4
 
5
5
  # http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
6
6
  #
7
- class MiniTest::Unit::TestCase
7
+ class MiniTest::Test
8
8
  # Quas Wex Exort.
9
9
  #
10
10
  def invoke_cache
@@ -79,5 +79,20 @@ module Greeb
79
79
  )
80
80
  end
81
81
  end
82
+
83
+ describe '.split' do
84
+ it 'should split characters' do
85
+ Tokenizer.split('loh').must_equal %w(l o h)
86
+ end
87
+
88
+ it 'should combine duplicated characters' do
89
+ Tokenizer.split('foo').must_equal %w(f oo)
90
+ end
91
+
92
+ it 'should also deal with line breaks' do
93
+ Tokenizer.split("bar\n\nbaz").must_equal(
94
+ [*%w(b a r), "\n\n", *%w(b a z)])
95
+ end
96
+ end
82
97
  end
83
98
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.rc1
4
+ version: 0.2.0.rc2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-05 00:00:00.000000000 Z
11
+ date: 2013-05-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -28,16 +28,16 @@ dependencies:
28
28
  name: minitest
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
- version: '2.11'
33
+ version: '5.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
- version: '2.11'
40
+ version: '5.0'
41
41
  description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
42
42
  written in Ruby.
43
43
  email: