greeb 0.1.2 → 0.2.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9320c86f24d2f6a40459a3bb6be6a8eedb21a822
4
- data.tar.gz: 810aec1c57162b502c1fb0dd9568f98a75690f6d
3
+ metadata.gz: a2f7bcbf04e2f79d8b703f67cb79de1ac65bef94
4
+ data.tar.gz: 3af3b46980820730acbb00cdd2c2ff0ae9fd91d4
5
5
  SHA512:
6
- metadata.gz: fbde05e3be7f071c9c7095bd8dfb1d1373694612ddcba2bc2a31014a02f2db6b1ed989c3e0d92f17076282bd8471b3c8bd8ed8325d9beaa26e0c68c887b24d86
7
- data.tar.gz: c2f613064ce1cebf4e39212c96a5861d86e269a5c3593622e3ec06dbd82d238982e3bf5f0aaa050bd3807021c8cc7d46ce569cba75a33eae56ff3a19fd0fd884
6
+ metadata.gz: 2ee73b3c4a6fc65dcf9b4b30728fe7d14f3e271f339913d6353e8e8ff15c5844f65848bbeb13865573ac6514953860334bb68bdf7793b6fc5d28cd05b23bdb99
7
+ data.tar.gz: 2ce4047f35268480ccc357eb7152623ffc2379dd7bc4c2e24169ece3390d8b02c20ae45d5874b9cb2a052571cfb820af3daf947c7215faed59c19070e8e276f1
data/lib/greeb.rb CHANGED
@@ -43,3 +43,4 @@ end
43
43
  require 'greeb/strscan'
44
44
  require 'greeb/tokenizer'
45
45
  require 'greeb/segmentator'
46
+ require 'greeb/parser'
@@ -0,0 +1,56 @@
1
+ # encoding: utf-8
2
+
3
+ # It is often necessary to find different entities in natural language
4
+ # text. These entities are URLs, e-mail addresses, names, etc. This module
5
+ # includes several helpers that could help to solve these problems.
6
+ #
7
module Greeb
  # Regexp-driven helpers that locate URLs and e-mail addresses in plain
  # text and report the character range each of them occupies.
  #
  module Parser
    extend self

    # URL pattern. Not so precise, but IDN-compatible.
    URL = /\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|\/)))/ui

    # Horrible e-mail pattern. At most four letters are matched after the
    # dot that precedes the top-level domain.
    EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/ui

    # Recognize URLs in the input text. Actually, URL is an obsolete
    # standard and this code should be rewritten to use the URI concept.
    #
    # @param text [String] input text.
    #
    # @return [Array<Greeb::Entity>] found URLs.
    #
    def urls(text)
      scan(text, URL, :url)
    end

    # Recognize e-mail addresses in the input text.
    #
    # @param text [String] input text.
    #
    # @return [Array<Greeb::Entity>] found e-mail addresses.
    #
    def emails(text)
      scan(text, EMAIL, :email)
    end

    private

    # Implementation of regexp-based {Greeb::Entity} scanner.
    #
    # Every non-overlapping match of +regexp+ becomes one entity whose
    # boundaries are the character offsets of the match inside +text+,
    # shifted by +offset+.
    #
    # @param text [String, nil] input text; +nil+ yields an empty result.
    # @param regexp [Regexp] regular expression to be used.
    # @param type [Symbol] type field for the new {Greeb::Entity} instances.
    # @param offset [Fixnum] value added to both boundaries of every match.
    #
    # @return [Array<Greeb::Entity>] found entities.
    #
    def scan(text, regexp, type, offset = 0)
      return [] unless text

      [].tap do |matches|
        # Scanning the intact string keeps the positions absolute, so no
        # manual bookkeeping is needed after each match and the character
        # following a match is still eligible to start the next one.
        text.scan(regexp) do
          start, stop = Regexp.last_match.offset(0)
          matches << Greeb::Entity.new(offset + start, offset + stop, type)
        end
      end
    end
  end
end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.1.2'
8
+ VERSION = '0.2.0.pre1'
9
9
  end
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
5
module Greeb
  describe Parser do
    # Sample sentence containing an IDN URL, an e-mail address and a plain
    # URL, each followed by punctuation that must not join the match.
    let(:text) do
      'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
      'And my e-mail is example@example.com! Also it is available by ' \
      'URL: http://vasya.ru.'
    end

    describe 'URL' do
      subject { Parser.urls(text) }

      it 'recognizes URLs' do
        # Boundaries are character offsets into +text+: the first URL
        # occupies 46...61, the second one starts right after "URL: " at
        # index 130 and is 15 characters long.
        subject.must_equal(
          [Entity.new(46, 61, :url),
           Entity.new(130, 145, :url)]
        )
      end
    end

    describe 'EMAIL' do
      subject { Parser.emails(text) }

      it 'recognizes e-mails' do
        # The address starts at index 80 and is 19 characters long.
        subject.must_equal(
          [Entity.new(80, 99, :email)]
        )
      end
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-08 00:00:00.000000000 Z
11
+ date: 2013-04-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -83,10 +83,12 @@ files:
83
83
  - Rakefile
84
84
  - greeb.gemspec
85
85
  - lib/greeb.rb
86
+ - lib/greeb/parser.rb
86
87
  - lib/greeb/segmentator.rb
87
88
  - lib/greeb/strscan.rb
88
89
  - lib/greeb/tokenizer.rb
89
90
  - lib/greeb/version.rb
91
+ - spec/parser_spec.rb
90
92
  - spec/segmentator_spec.rb
91
93
  - spec/spec_helper.rb
92
94
  - spec/tokenizer_spec.rb
@@ -104,9 +106,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
104
106
  version: '0'
105
107
  required_rubygems_version: !ruby/object:Gem::Requirement
106
108
  requirements:
107
- - - '>='
109
+ - - '>'
108
110
  - !ruby/object:Gem::Version
109
- version: '0'
111
+ version: 1.3.1
110
112
  requirements: []
111
113
  rubyforge_project: greeb
112
114
  rubygems_version: 2.0.3
@@ -114,6 +116,7 @@ signing_key:
114
116
  specification_version: 4
115
117
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
116
118
  test_files:
119
+ - spec/parser_spec.rb
117
120
  - spec/segmentator_spec.rb
118
121
  - spec/spec_helper.rb
119
122
  - spec/tokenizer_spec.rb