greeb 0.1.2 → 0.2.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9320c86f24d2f6a40459a3bb6be6a8eedb21a822
4
- data.tar.gz: 810aec1c57162b502c1fb0dd9568f98a75690f6d
3
+ metadata.gz: a2f7bcbf04e2f79d8b703f67cb79de1ac65bef94
4
+ data.tar.gz: 3af3b46980820730acbb00cdd2c2ff0ae9fd91d4
5
5
  SHA512:
6
- metadata.gz: fbde05e3be7f071c9c7095bd8dfb1d1373694612ddcba2bc2a31014a02f2db6b1ed989c3e0d92f17076282bd8471b3c8bd8ed8325d9beaa26e0c68c887b24d86
7
- data.tar.gz: c2f613064ce1cebf4e39212c96a5861d86e269a5c3593622e3ec06dbd82d238982e3bf5f0aaa050bd3807021c8cc7d46ce569cba75a33eae56ff3a19fd0fd884
6
+ metadata.gz: 2ee73b3c4a6fc65dcf9b4b30728fe7d14f3e271f339913d6353e8e8ff15c5844f65848bbeb13865573ac6514953860334bb68bdf7793b6fc5d28cd05b23bdb99
7
+ data.tar.gz: 2ce4047f35268480ccc357eb7152623ffc2379dd7bc4c2e24169ece3390d8b02c20ae45d5874b9cb2a052571cfb820af3daf947c7215faed59c19070e8e276f1
data/lib/greeb.rb CHANGED
@@ -43,3 +43,4 @@ end
43
43
  require 'greeb/strscan'
44
44
  require 'greeb/tokenizer'
45
45
  require 'greeb/segmentator'
46
+ require 'greeb/parser'
@@ -0,0 +1,56 @@
1
+ # encoding: utf-8
2
+
3
module Greeb
  # Natural language text often contains entities that are not ordinary
  # words: URLs, e-mail addresses, names, etc. This module provides
  # regexp-based helpers that locate such entities in a string and report
  # their character positions.
  #
  module Parser
    extend self

    # URL pattern. Not so precise, but IDN-compatible.
    URL = /\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|\/)))/ui

    # Simplistic e-mail pattern: case-insensitive, and the top-level domain
    # is limited to 2-4 ASCII letters.
    EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/ui

    # Recognize URLs in the input text. Strictly speaking, the notion of URL
    # is superseded by URI, and this code should eventually be rewritten in
    # terms of URIs.
    #
    # @param text [String] input text.
    #
    # @return [Array<Greeb::Entity>] found URLs.
    #
    def urls(text)
      scan(text, URL, :url)
    end

    # Recognize e-mail addresses in the input text.
    #
    # @param text [String] input text.
    #
    # @return [Array<Greeb::Entity>] found e-mail addresses.
    #
    def emails(text)
      scan(text, EMAIL, :email)
    end

    private

    # Implementation of regexp-based {Greeb::Entity} scanner.
    #
    # Every non-overlapping match of +regexp+ yields one entity built from
    # the character offset where the match starts, the character offset
    # just past its end, and +type+. Offsets always refer to the original
    # +text+ (shifted by +offset+), no matter how many matches precede them.
    #
    # @param text [String, nil] input text; +nil+ yields an empty result.
    # @param regexp [Regexp] regular expression to be used.
    # @param type [Symbol] type field for the new {Greeb::Entity} instances.
    # @param offset [Integer] value added to every reported position.
    #
    # @return [Array<Greeb::Entity>] found entities.
    #
    def scan(text, regexp, type, offset = 0)
      matches = []
      return matches unless text

      # String#scan walks the string itself, so positions never drift the
      # way they do when the text is re-sliced after each match.
      text.scan(regexp) do
        from, to = Regexp.last_match.offset(0)
        matches << Greeb::Entity.new(offset + from, offset + to, type)
      end

      matches
    end
  end
end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.1.2'
8
+ VERSION = '0.2.0.pre1'
9
9
  end
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
5
module Greeb
  # Specification of the regexp-based entity helpers in Greeb::Parser.
  # Expected entities are given as character offsets into the sample text:
  # the position where the entity starts and the position just past its end.
  describe Parser do
    let(:text) do
      'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
      'And my e-mail is example@example.com! Also it is available by ' \
      'URL: http://vasya.ru.'
    end

    describe 'URL' do
      subject { Parser.urls(text) }

      it 'recognizes URLs' do
        # 'http://вася.рф/' is text[46...61]; 'http://vasya.ru' is
        # text[130...145] (text[129] is the space that precedes it).
        subject.must_equal(
          [Entity.new(46, 61, :url),
           Entity.new(130, 145, :url)]
        )
      end
    end

    describe 'EMAIL' do
      subject { Parser.emails(text) }

      it 'recognizes e-mails' do
        # 'example@example.com' is text[80...99].
        subject.must_equal(
          [Entity.new(80, 99, :email)]
        )
      end
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-08 00:00:00.000000000 Z
11
+ date: 2013-04-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -83,10 +83,12 @@ files:
83
83
  - Rakefile
84
84
  - greeb.gemspec
85
85
  - lib/greeb.rb
86
+ - lib/greeb/parser.rb
86
87
  - lib/greeb/segmentator.rb
87
88
  - lib/greeb/strscan.rb
88
89
  - lib/greeb/tokenizer.rb
89
90
  - lib/greeb/version.rb
91
+ - spec/parser_spec.rb
90
92
  - spec/segmentator_spec.rb
91
93
  - spec/spec_helper.rb
92
94
  - spec/tokenizer_spec.rb
@@ -104,9 +106,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
104
106
  version: '0'
105
107
  required_rubygems_version: !ruby/object:Gem::Requirement
106
108
  requirements:
107
- - - '>='
109
+ - - '>'
108
110
  - !ruby/object:Gem::Version
109
- version: '0'
111
+ version: 1.3.1
110
112
  requirements: []
111
113
  rubyforge_project: greeb
112
114
  rubygems_version: 2.0.3
@@ -114,6 +116,7 @@ signing_key:
114
116
  specification_version: 4
115
117
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
116
118
  test_files:
119
+ - spec/parser_spec.rb
117
120
  - spec/segmentator_spec.rb
118
121
  - spec/spec_helper.rb
119
122
  - spec/tokenizer_spec.rb