greeb 0.2.0.rc1 → 0.2.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
4
- data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
3
+ metadata.gz: 78e6a2b607a8690b2fe171665e35272efd31b2ac
4
+ data.tar.gz: 4517bb06cc8e1f8b0be5fc47bca4bfeda0fcfd49
5
5
  SHA512:
6
- metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
7
- data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
6
+ metadata.gz: dd24765af042566249e0c8d3153aee640c626b8f838fdda03c7b6598ad30bd0935df47ad1d1c6658ccf0e4fd598a94052c99dee127becb2f9c59b3e8dafe1cf0
7
+ data.tar.gz: 9eca1a25e8837732827a282d3d532db35113620372a0bde41b64669cb3fd696cc4c149267e19a5cfe58df7367b2932942554ea88cfd26826c73db926ad6ca89d
data/README.md CHANGED
@@ -129,6 +129,7 @@ pp segmentator.extract(segmentator.sentences)
129
129
  Texts often include some special entities such as URLs and e-mail
130
130
  addresses. Greeb can help you retrieve these strings.
131
131
 
132
+ #### URL and E-mail retrieval
132
133
  ```ruby
133
134
  text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
134
135
 
@@ -145,6 +146,19 @@ pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
145
146
 
146
147
  Please don't use Greeb for spam list development purposes.
147
148
 
149
+ #### Abbreviation retrieval
150
+ ```ruby
151
+ text = 'Hello, G.L.H.F. everyone!'
152
+
153
+ pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
154
+ =begin
155
+ [[#<struct Greeb::Entity from=7, to=15, type=:abbrev>, "G.L.H.F."]]
156
+ =end
157
+ ```
158
+
159
+ The algorithm is not very accurate, but it is still useful in many practical
160
+ situations.
161
+
148
162
  ## Tokens
149
163
  Greeb operates with entities, tuples of *(from, to, kind)*, where
150
164
  *from* is the beginning of the entity, *to* is the end of the entity,
data/greeb.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
16
16
  s.rubyforge_project = 'greeb'
17
17
 
18
18
  s.add_development_dependency 'rake'
19
- s.add_development_dependency 'minitest', '>= 2.11'
19
+ s.add_development_dependency 'minitest', '~> 5.0'
20
20
 
21
21
  s.files = `git ls-files`.split("\n")
22
22
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
data/lib/greeb/parser.rb CHANGED
@@ -13,6 +13,9 @@ module Greeb::Parser
13
13
  # A horrible e-mail pattern.
14
14
  EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
15
15
 
16
+ # Another horrible pattern. Now for abbreviations.
17
+ ABBREV = /\b(\p{L}\.)+/i
18
+
16
19
  # Recognize URLs in the input text. Actually, URL is an obsolete standard
17
20
  # and this code should be rewritten to use the URI concept.
18
21
  #
@@ -34,6 +37,16 @@ module Greeb::Parser
34
37
  scan(text, EMAIL, :email)
35
38
  end
36
39
 
40
+ # Recognize abbreviations in the input text.
41
+ #
42
+ # @param text [String] input text.
43
+ #
44
+ # @return [Array<Greeb::Entity>] found abbreviations.
45
+ #
46
+ def abbrevs(text)
47
+ scan(text, ABBREV, :abbrev)
48
+ end
49
+
37
50
  private
38
51
  # Implementation of regexp-based {Greeb::Entity} scanner.
39
52
  #
@@ -57,6 +57,20 @@ module Greeb::Tokenizer
57
57
  scanner.terminate
58
58
  end
59
59
 
60
+ # Split one line into an array of characters, but also combine duplicated
61
+ # characters.
62
+ #
63
+ # For instance, `"a b\n\n\nc"` would be transformed into the following
64
+ # array: `["a", " ", "b", "\n\n\n", "c"]`.
65
+ #
66
+ # @param token [String] a token to be split.
67
+ #
68
+ # @return [Array<String>] split characters.
69
+ #
70
+ def split(token)
71
+ token.scan(/((.|\n)\2*)/).map(&:first)
72
+ end
73
+
60
74
  protected
61
75
  # One iteration of the tokenization process.
62
76
  #
@@ -115,18 +129,4 @@ module Greeb::Tokenizer
115
129
  before + s.length
116
130
  end
117
131
  end
118
-
119
- # Split one line into characters array, but also combine line breaks
120
- # into single elements.
121
- #
122
- # For instance, `"a b\n\n\nc"` would be transformed into the following
123
- # array: `["a", " ", "b", "\n\n\n", "c"]`.
124
- #
125
- # @param token [String] a token to be splitted.
126
- #
127
- # @return [Array<String>] splitted characters.
128
- #
129
- def split(token)
130
- token.scan(/((.|\n)\2*)/).map(&:first)
131
- end
132
132
  end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.0.rc1'
8
+ VERSION = '0.2.0.rc2'
9
9
  end
data/spec/parser_spec.rb CHANGED
@@ -5,9 +5,9 @@ require_relative 'spec_helper'
5
5
  module Greeb
6
6
  describe Parser do
7
7
  let(:text) do
8
- 'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
8
+ 'Hello there! My name is Vasya B. My website is: http://вася.рф/. ' \
9
9
  'And my e-mail is example@example.com! Also it is available by ' \
10
- 'URL: http://vasya.ru.'
10
+ 'URL: http://vasya.ru. Also, G.L.H.F. everyone!'
11
11
  end
12
12
 
13
13
  describe 'URL' do
@@ -15,8 +15,8 @@ module Greeb
15
15
 
16
16
  it 'recognizes URLs' do
17
17
  subject.must_equal(
18
- [Entity.new(46, 61, :url),
19
- Entity.new(130, 145, :url)]
18
+ [Entity.new(48, 63, :url),
19
+ Entity.new(132, 147, :url)]
20
20
  )
21
21
  end
22
22
  end
@@ -26,7 +26,18 @@ module Greeb
26
26
 
27
27
  it 'recognizes e-mails' do
28
28
  subject.must_equal(
29
- [Entity.new(80, 99, :email)]
29
+ [Entity.new(82, 101, :email)]
30
+ )
31
+ end
32
+ end
33
+
34
+ describe 'ABBREV' do
35
+ subject { Parser.abbrevs(text) }
36
+
37
+ it 'recognizes abbreviations' do
38
+ subject.must_equal(
39
+ [Entity.new(30, 32, :abbrev),
40
+ Entity.new(155, 163, :abbrev)]
30
41
  )
31
42
  end
32
43
  end
@@ -4,7 +4,7 @@ require 'open3'
4
4
 
5
5
  # http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
6
6
  #
7
- class MiniTest::Unit::TestCase
7
+ class MiniTest::Test
8
8
  # Quas Wex Exort.
9
9
  #
10
10
  def invoke_cache
@@ -79,5 +79,20 @@ module Greeb
79
79
  )
80
80
  end
81
81
  end
82
+
83
+ describe '.split' do
84
+ it 'should split characters' do
85
+ Tokenizer.split('loh').must_equal %w(l o h)
86
+ end
87
+
88
+ it 'should combine duplicated characters' do
89
+ Tokenizer.split('foo').must_equal %w(f oo)
90
+ end
91
+
92
+ it 'should also deal with line breaks' do
93
+ Tokenizer.split("bar\n\nbaz").must_equal(
94
+ [*%w(b a r), "\n\n", *%w(b a z)])
95
+ end
96
+ end
82
97
  end
83
98
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.rc1
4
+ version: 0.2.0.rc2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-05 00:00:00.000000000 Z
11
+ date: 2013-05-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -28,16 +28,16 @@ dependencies:
28
28
  name: minitest
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
- version: '2.11'
33
+ version: '5.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
- version: '2.11'
40
+ version: '5.0'
41
41
  description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
42
42
  written in Ruby.
43
43
  email: