greeb 0.1.2 → 0.2.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/greeb.rb +1 -0
- data/lib/greeb/parser.rb +56 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/parser_spec.rb +34 -0
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2f7bcbf04e2f79d8b703f67cb79de1ac65bef94
|
4
|
+
data.tar.gz: 3af3b46980820730acbb00cdd2c2ff0ae9fd91d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ee73b3c4a6fc65dcf9b4b30728fe7d14f3e271f339913d6353e8e8ff15c5844f65848bbeb13865573ac6514953860334bb68bdf7793b6fc5d28cd05b23bdb99
|
7
|
+
data.tar.gz: 2ce4047f35268480ccc357eb7152623ffc2379dd7bc4c2e24169ece3390d8b02c20ae45d5874b9cb2a052571cfb820af3daf947c7215faed59c19070e8e276f1
|
data/lib/greeb.rb
CHANGED
data/lib/greeb/parser.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# It is often necessary to find different entities in natural language
|
4
|
+
# text. These entities are URLs, e-mail addresses, names, etc. This module
|
5
|
+
# includes several helpers that could help to solve these problems.
|
6
|
+
#
|
7
|
+
module Greeb::Parser
|
8
|
+
extend self
|
9
|
+
|
10
|
+
# URL pattern. Not very precise, but IDN-compatible.
|
11
|
+
URL = /\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|\/)))/ui
|
12
|
+
|
13
|
+
# Simplistic e-mail pattern: no IDN support, and the top-level domain is limited to 2-4 letters.
|
14
|
+
EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/ui
|
15
|
+
|
16
|
+
# Find every URL mentioned in the given text.
#
# NOTE: "URL" is an obsolete standard; this code should eventually be
# rewritten around the URI concept.
#
# @param text [String] input text.
#
# @return [Array<Greeb::Entity>] one entity per matched URL.
#
def urls(text)
  pattern = URL
  scan(text, pattern, :url)
end
|
26
|
+
|
27
|
+
# Find every e-mail address mentioned in the given text.
#
# @param text [String] input text.
#
# @return [Array<Greeb::Entity>] one entity per matched address.
#
def emails(text)
  pattern = EMAIL
  scan(text, pattern, :email)
end
|
36
|
+
|
37
|
+
private
|
38
|
+
# Regexp-based {Greeb::Entity} scanner shared by the public helpers.
#
# Walks through the text from left to right and records every
# non-overlapping match of the regular expression. Reported positions
# are character offsets into the given text, shifted by +offset+.
#
# @param text [String, nil] input text; +nil+ yields no entities.
# @param regexp [Regexp] regular expression to be used.
# @param type [Symbol] type field for the new {Greeb::Entity} instances.
# @param offset [Integer] value added to the position of every match.
#
# @return [Array<Greeb::Entity>] found entities, in order of appearance.
#
def scan(text, regexp, type, offset = 0)
  matches = []
  return matches unless text

  position = 0
  # Search the original string from a moving position instead of slicing
  # it, so that MatchData offsets never need to be re-based.
  while position <= text.length && (md = text.match(regexp, position))
    start, stop = md.offset(0)
    matches << Greeb::Entity.new(offset + start, offset + stop, type)
    # Continue right after the match. A zero-width match would leave the
    # position unchanged, so step over one character to guarantee progress.
    position = stop > start ? stop : stop + 1
  end
  matches
end
|
56
|
+
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
module Greeb
  describe Parser do
    # Sample sentence containing an IDN URL, an e-mail address and a plain URL.
    let(:text) do
      'Hello there! My name is Vasya. My website is: http://вася.рф/. ' \
      'And my e-mail is example@example.com! Also it is available by ' \
      'URL: http://vasya.ru.'
    end

    describe 'URL' do
      subject { Parser.urls(text) }

      it 'recognizes URLs' do
        # Expected spans are [start, stop) character positions in +text+:
        # 'http://вася.рф/' is text[46...61] and
        # 'http://vasya.ru' is text[130...145].
        subject.must_equal(
          [Entity.new(46, 61, :url),
           Entity.new(130, 145, :url)]
        )
      end
    end

    describe 'EMAIL' do
      subject { Parser.emails(text) }

      it 'recognizes e-mails' do
        # 'example@example.com' is text[80...99].
        subject.must_equal(
          [Entity.new(80, 99, :email)]
        )
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0.pre1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-04-
|
11
|
+
date: 2013-04-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -83,10 +83,12 @@ files:
|
|
83
83
|
- Rakefile
|
84
84
|
- greeb.gemspec
|
85
85
|
- lib/greeb.rb
|
86
|
+
- lib/greeb/parser.rb
|
86
87
|
- lib/greeb/segmentator.rb
|
87
88
|
- lib/greeb/strscan.rb
|
88
89
|
- lib/greeb/tokenizer.rb
|
89
90
|
- lib/greeb/version.rb
|
91
|
+
- spec/parser_spec.rb
|
90
92
|
- spec/segmentator_spec.rb
|
91
93
|
- spec/spec_helper.rb
|
92
94
|
- spec/tokenizer_spec.rb
|
@@ -104,9 +106,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
104
106
|
version: '0'
|
105
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
108
|
requirements:
|
107
|
-
- - '
|
109
|
+
- - '>'
|
108
110
|
- !ruby/object:Gem::Version
|
109
|
-
version:
|
111
|
+
version: 1.3.1
|
110
112
|
requirements: []
|
111
113
|
rubyforge_project: greeb
|
112
114
|
rubygems_version: 2.0.3
|
@@ -114,6 +116,7 @@ signing_key:
|
|
114
116
|
specification_version: 4
|
115
117
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
116
118
|
test_files:
|
119
|
+
- spec/parser_spec.rb
|
117
120
|
- spec/segmentator_spec.rb
|
118
121
|
- spec/spec_helper.rb
|
119
122
|
- spec/tokenizer_spec.rb
|