greeb 0.1.2 → 0.2.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/greeb.rb +1 -0
- data/lib/greeb/parser.rb +56 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/parser_spec.rb +34 -0
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2f7bcbf04e2f79d8b703f67cb79de1ac65bef94
|
4
|
+
data.tar.gz: 3af3b46980820730acbb00cdd2c2ff0ae9fd91d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ee73b3c4a6fc65dcf9b4b30728fe7d14f3e271f339913d6353e8e8ff15c5844f65848bbeb13865573ac6514953860334bb68bdf7793b6fc5d28cd05b23bdb99
|
7
|
+
data.tar.gz: 2ce4047f35268480ccc357eb7152623ffc2379dd7bc4c2e24169ece3390d8b02c20ae45d5874b9cb2a052571cfb820af3daf947c7215faed59c19070e8e276f1
|
data/lib/greeb.rb
CHANGED
data/lib/greeb/parser.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# It is often necessary to find different entities in natural language
|
4
|
+
# text. These entities are URLs, e-mail addresses, names, etc. This module
|
5
|
+
# includes several helpers that could help to solve these problems.
|
6
|
+
#
|
7
|
+
module Greeb::Parser
  extend self

  # URL pattern. Not so precise, but IDN-compatible.
  URL = /\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|\/)))/ui

  # Horrible e-mail pattern.
  EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/ui

  # Recognize URLs in the input text. Actually, URL is obsolete standard
  # and this code should be rewritten to use the URI concept.
  #
  # @param text [String] input text.
  #
  # @return [Array<Greeb::Entity>] found URLs.
  #
  def urls(text)
    scan(text, URL, :url)
  end

  # Recognize e-mail addresses in the input text.
  #
  # @param text [String] input text.
  #
  # @return [Array<Greeb::Entity>] found e-mail addresses.
  #
  def emails(text)
    scan(text, EMAIL, :email)
  end

  private

  # Implementation of regexp-based {Greeb::Entity} scanner.
  #
  # Collects every non-overlapping match of +regexp+ in +text+, from left
  # to right. Each match becomes a {Greeb::Entity} built from the match's
  # begin offset, its end offset (the position right after the last
  # matched character) and +type+. Offsets are character offsets within
  # +text+, shifted by +offset+.
  #
  # @param text [String, nil] input text; +nil+ yields an empty array.
  # @param regexp [Regexp] regular expression to be used.
  # @param type [Symbol] type field for the new {Greeb::Entity} instances.
  # @param offset [Fixnum] value added to every reported position.
  #
  # @return [Array<Greeb::Entity>] found entities.
  #
  def scan(text, regexp, type, offset = 0)
    matches = []
    return matches unless text

    # Search the original string from a moving position instead of slicing
    # it, so MatchData offsets stay absolute and cannot drift away from the
    # consumed prefix.
    position = 0
    while (md = text.match(regexp, position))
      start, stop = md.offset(0)
      matches << Greeb::Entity.new(offset + start, offset + stop, type)
      # Resume right after the match; step one character past an empty
      # match so the loop always makes progress.
      position = stop > start ? stop : stop + 1
    end

    matches
  end
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
module Greeb
  # Checks that Parser reports the character offsets of the URLs and the
  # e-mail address contained in a fixed sample text.
  describe Parser do
    let(:text) do
      'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
      'And my e-mail is example@example.com! Also it is available by ' \
      'URL: http://vasya.ru.'
    end

    describe 'URL' do
      subject { Parser.urls(text) }

      it 'recognizes URLs' do
        # Character positions within `text` (end is exclusive):
        #   "http://вася.рф/"  occupies 46...61
        #   "http://vasya.ru"  occupies 130...145 (the final dot is not
        #   part of the URL)
        subject.must_equal(
          [Entity.new(46, 61, :url),
           Entity.new(130, 145, :url)]
        )
      end
    end

    describe 'EMAIL' do
      subject { Parser.emails(text) }

      it 'recognizes e-mails' do
        # "example@example.com" occupies characters 80...99 of `text`.
        subject.must_equal(
          [Entity.new(80, 99, :email)]
        )
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0.pre1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-04-
|
11
|
+
date: 2013-04-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -83,10 +83,12 @@ files:
|
|
83
83
|
- Rakefile
|
84
84
|
- greeb.gemspec
|
85
85
|
- lib/greeb.rb
|
86
|
+
- lib/greeb/parser.rb
|
86
87
|
- lib/greeb/segmentator.rb
|
87
88
|
- lib/greeb/strscan.rb
|
88
89
|
- lib/greeb/tokenizer.rb
|
89
90
|
- lib/greeb/version.rb
|
91
|
+
- spec/parser_spec.rb
|
90
92
|
- spec/segmentator_spec.rb
|
91
93
|
- spec/spec_helper.rb
|
92
94
|
- spec/tokenizer_spec.rb
|
@@ -104,9 +106,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
104
106
|
version: '0'
|
105
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
108
|
requirements:
|
107
|
-
- - '
|
109
|
+
- - '>'
|
108
110
|
- !ruby/object:Gem::Version
|
109
|
-
version:
|
111
|
+
version: 1.3.1
|
110
112
|
requirements: []
|
111
113
|
rubyforge_project: greeb
|
112
114
|
rubygems_version: 2.0.3
|
@@ -114,6 +116,7 @@ signing_key:
|
|
114
116
|
specification_version: 4
|
115
117
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
116
118
|
test_files:
|
119
|
+
- spec/parser_spec.rb
|
117
120
|
- spec/segmentator_spec.rb
|
118
121
|
- spec/spec_helper.rb
|
119
122
|
- spec/tokenizer_spec.rb
|