extractors 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.markdown +45 -0
- data/Rakefile +9 -0
- data/extractors.gemspec +24 -0
- data/lib/extractors.rb +174 -0
- data/lib/extractors/extractor.rb +45 -0
- data/lib/extractors/version.rb +3 -0
- data/specs/extractors_spec.rb +81 -0
- data/specs/spec_helper.rb +3 -0
- metadata +77 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
### Extract user accounts from different contexts...
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
Extractors(:twitter).sanitize("http://twitter.com/#!/programmingshit") # => "programmingshit"
|
5
|
+
Extractors(:twitter).sanitize("programmingshit") # => "programmingshit"
|
6
|
+
```
|
7
|
+
|
8
|
+
### ... and format them back
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
Extractors(:twitter).format("programmingshit") # => "http://twitter.com/#!/programmingshit"
|
12
|
+
```
|
13
|
+
|
14
|
+
### Add your own extractors
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
Extractors.add :lookatme do
|
18
|
+
username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
|
19
|
+
sanitizer [
|
20
|
+
%r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
|
21
|
+
%r{^(#{username_regexp})$}
|
22
|
+
]
|
23
|
+
formatter 'http://lookatme.ru/users/%s'
|
24
|
+
end
|
25
|
+
```
|
26
|
+
|
27
|
+
### List of available extractors
|
28
|
+
|
29
|
+
* :facebook
|
30
|
+
* :googleplus
|
31
|
+
* :gtalk
|
32
|
+
* :lastfm
|
33
|
+
* :livejournal
|
34
|
+
* :lookatme
|
35
|
+
* :moikrug
|
36
|
+
* :myspace
|
37
|
+
* :skype
|
38
|
+
* :tumblr
|
39
|
+
* :twitter
|
40
|
+
* :vkontakte
|
41
|
+
* :youtube
|
42
|
+
|
43
|
+
### Credits
|
44
|
+
|
45
|
+
The code itself has been *extracted* from the <http://github.com/toy/contacts> gem.
|
data/Rakefile
ADDED
data/extractors.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "extractors/version"
|
4
|
+
|
5
|
+
# Gem specification for the extractors gem.
# File lists are taken from git, so the gem must be built inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "extractors"
  s.version     = Extractors::VERSION
  s.authors     = ["macovsky"]
  s.email       = ["robotector@gmail.com"]
  s.homepage    = "http://github.com/macovsky/extractors"
  s.summary     = %q{Extract user accounts from different urls: tumblr, facebook and others.}
  # RubyGems warns when the description is empty, so spell out what the gem does.
  s.description = %q{Extracts user account names from profile urls of various services (tumblr, facebook, twitter and others) and formats account names back into urls.}

  s.rubyforge_project = "extractors"

  s.files         = `git ls-files`.split("\n")
  # This gem keeps its specs under specs/, which the stock {test,spec,features}
  # glob does not match; without it test_files is always empty.
  s.test_files    = `git ls-files -- {test,spec,specs,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  s.add_development_dependency "minitest"
  s.add_development_dependency "rake"
end
|
data/lib/extractors.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'extractors/version'
|
2
|
+
require 'extractors/extractor'
|
3
|
+
|
4
|
+
# Shortcut for looking up a registered extractor: Extractors(:twitter).
#
# name - anything responding to #to_sym (Symbol or String)
#
# Returns whatever is registered under that name, or nil when nothing is.
def Extractors(name)
  registry = Extractors.extractors
  registry[name.to_sym]
end
|
7
|
+
|
8
|
+
# Registry of named account extractors, followed by the built-in definitions.
#
# Each definition configures an Extractor with a sanitizer (pulls an account
# name out of a url or a bare handle) and a formatter (turns an account name
# back into a url).
module Extractors
  class << self
    # Creates an Extractor, evaluates the block in the extractor's own context
    # (so `sanitizer` and `formatter` can be called without a receiver) and
    # registers it under +name+.
    #
    # name  - anything responding to #to_sym
    # block - configuration block
    #
    # Returns the registered extractor.
    def add(name, &block)
      extractor = Extractor.new
      extractor.instance_eval(&block)
      extractors[name.to_sym] = extractor
    end

    # Returns the Hash of registered extractors keyed by Symbol,
    # created on first access.
    def extractors
      @extractors ||= {}
    end
  end

  add :facebook do
    username_regexp = '[0-9]{1,25}'
    literal_username_regexp = '[a-zA-Z.]{1,25}'
    sanitizer [
      # The profile.php / group.php patterns have to be tried before the
      # vanity-name pattern, otherwise "facebook.com/profile.php?id=1" is
      # captured as the vanity name "profile.php" and the id is lost.
      %r{^(?:http://)?(?:www\.)?facebook\.com/profile\.php\?id=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/group\.php\?gid=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/(#{literal_username_regexp})},
      %r{^(#{literal_username_regexp})$},
      %r{^(#{username_regexp})$}
    ]
    # Purely numeric values are profile ids, everything else is a vanity name.
    formatter do |value|
      if value[%r{^\d+$}]
        "http://facebook.com/profile.php?id=#{value}"
      else
        "http://facebook.com/#{value}"
      end
    end
  end

  # add :flickr do
  #   username_regexp = '[\\-a-zA-Z0-9_@]{1,50}'
  #   sanitizer [
  #     %r{^(?:http://)?(?:www\.)?flickr\.com/(?:photos|people)/(#{username_regexp})},
  #     %r{^(#{username_regexp})$}
  #   ]
  #   formatter 'http://flickr.com/photos/%s'
  # end

  add :googleplus do
    # Dots are escaped so that only the literal host name matches.
    sanitizer %r{^(?:https?://)?plus\.google\.com/(?:u/\d/)?(\d+)}
    formatter 'https://plus.google.com/%s/posts'
  end

  add :gtalk do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)*'
      # nil has no #[]; coercing keeps sanitize(nil) => nil like the
      # regexp-based extractors.
      value = value.to_s
      if (result = value[%r{^(#{username_regexp})(?:@gmail\.com)?$}, 1])
        # Bare names and @gmail.com addresses: the local part must be 6..30
        # characters long. The range is parenthesised on purpose: an unparenthesised
        # `a && 6..30 === n` in a condition parses as a flip-flop and is
        # always truthy.
        "#{result}@gmail.com" if (6..30).include?(result.length)
      else
        # Addresses on any other domain are returned unchanged.
        value[%r{^#{username_regexp}@[a-z0-9]+(\.[a-z0-9]+)+$}]
      end
    end
    formatter 'gtalk:chat?jid=%s'
  end

  # add :icq do
  #   sanitizer %r{^\d+$}
  #   formatter 'http://icq.com/%s'
  # end

  add :lastfm do
    username_regexp = '[a-zA-Z][_a-zA-Z0-9\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?last\.fm/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://last.fm/user/%s'
  end

  add :livejournal do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(?:users|community)\.livejournal\.com/(#{username_regexp})},
        %r{^(?:http://)?(#{username_regexp})\.livejournal\.com},
        %r{^(#{username_regexp})$}
      ])
      # "www" is the site's own subdomain, not an account; underscores are
      # written as dashes because the name is used as a subdomain.
      result.gsub('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.livejournal.com/'
  end

  add :lookatme do
    username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://lookatme.ru/users/%s'
  end

  add :moikrug do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9][a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(#{username_regexp})\.moikrug\.ru},
        %r{^(#{username_regexp})$}
      ])
      # Same subdomain handling as livejournal.
      result.gsub('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.moikrug.ru/'
  end

  add :myspace do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?myspace\.com/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://myspace.com/%s'
  end

  add :skype do
    sanitizer %r{^[a-z][a-z0-9_,.\-]{5,31}$}i
    formatter 'skype:%s?userinfo'
  end

  add :tumblr do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      # "www." is optional here, as in the other extractors; it used to be
      # mandatory, so "tumblr.com/blog/name" was rejected.
      %r{^(?:http://)?(?:www\.)?tumblr\.com/blog/(#{username_regexp})},
      %r{^(?:http://)?(#{username_regexp})\.tumblr\.com},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://%s.tumblr.com'
  end

  add :twitter do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:https?://)?(?:www\.)?twitter\.com/(?:#!/)?(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://twitter.com/#!/%s'
  end

  add :vkontakte do
    id_regexp = '[0-9]{1,25}'
    username_regexp = '[a-zA-Z][a-zA-Z0-9_\.]{4,}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/id(#{id_regexp})},
      %r{^(#{id_regexp})$},
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/(#{username_regexp})},
      %r{^(#{username_regexp})$},
    ]
    # Values containing any non-digit are screen names; all-digit values are ids.
    formatter do |value|
      if value[/\D/]
        "http://vkontakte.ru/#{value}"
      else
        "http://vkontakte.ru/id#{value}"
      end
    end
  end

  add :youtube do
    username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?youtube\.com/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://youtube.com/user/%s'
  end
end
|
174
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Extractors
  # Holds one sanitizer and one formatter.
  #
  # `sanitizer` / `formatter` store a rule; `sanitize` / `format` apply the
  # stored rule to a value. With no rule stored, the value is returned as is.
  class Extractor
    # Stores the sanitizing rule: a Regexp, an Array of Regexps, or a block.
    # A positional argument takes precedence over a block.
    #
    # Returns the stored rule.
    def sanitizer(arg = nil, &block)
      @sanitizer = arg || block
    end

    # Applies the stored sanitizing rule to +value+.
    #
    # Returns the value untouched when no rule is stored, the block's result
    # for a Proc rule, or for Regexp rules the first capture group of the
    # first matching pattern (the whole match if the pattern has no group,
    # nil if nothing matches).
    # Raises RuntimeError for any other kind of rule.
    def sanitize(value)
      rule = @sanitizer
      if rule.nil?
        value
      elsif rule.is_a?(Proc)
        rule.call(value)
      elsif rule.is_a?(Regexp)
        value_from_matching_regexp(value, [rule])
      elsif rule.is_a?(Array)
        value_from_matching_regexp(value, rule)
      else
        raise "Unknown type of sanitizer: #{rule.inspect}"
      end
    end

    # Stores the formatting rule: a format String (e.g. 'http://x/%s') or a
    # block. A positional argument takes precedence over a block.
    #
    # Returns the stored rule.
    def formatter(arg = nil, &block)
      @formatter = arg || block
    end

    # Applies the stored formatting rule to +value+.
    #
    # Returns the value untouched when no rule is stored, the String rule
    # filled in with the value, or the block's result for a Proc rule.
    # Raises RuntimeError for any other kind of rule.
    def format(value)
      template = @formatter
      return value if template.nil?
      return template % value if template.is_a?(String)
      return template.call(value) if template.is_a?(Proc)

      raise "Unknown type of formatter: #{template.inspect}"
    end

    private

    # Tries each pattern in order against +value+ and stops at the first hit.
    #
    # Returns the first capture group of the matching pattern, the whole
    # match when that group is absent, or nil when no pattern matches.
    def value_from_matching_regexp(value, regexps)
      regexps.each do |pattern|
        next unless pattern === value

        hit = Regexp.last_match
        return hit && (hit[1] || hit[0])
      end
      nil
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Extractors do
  it "can reach extractors by a shortcut" do
    Extractors(:skype).must_be_instance_of Extractors::Extractor
  end

  it "should have default extractors" do
    Extractors.extractors.wont_be_empty
    Extractors.extractors.each_value do |extractor|
      extractor.must_be_instance_of Extractors::Extractor
    end
  end

  it "can add new extractors" do
    Extractors.add(:test) {}
    extractor = Extractors(:test)

    extractor.wont_be_nil
    extractor.must_respond_to(:sanitize)
    extractor.must_respond_to(:format)
  end

  describe "existing extractors" do
    # Extractor name => { :sanitize => { input => expected },
    #                     :format   => { input => expected } }
    expectations = {
      :tumblr => {
        :sanitize => {
          "http://robotector.tumblr.com" => "robotector",
          "http://www.tumblr.com/blog/robotector" => "robotector",
          "robotector" => "robotector",
        },
        :format => {
          "robotector" => "http://robotector.tumblr.com"
        }
      },
      :lookatme => {
        :sanitize => {
          "lookatme.ru/users/macovsky" => "macovsky",
          "macovsky" => "macovsky",
        },
        :format => {
          "macovsky" => "http://lookatme.ru/users/macovsky"
        }
      },
      :twitter => {
        :sanitize => {
          "twitter.com/robotector" => "robotector",
          "https://twitter.com/#!/robotector" => "robotector",
          "http://twitter.com/#!/robotector" => "robotector"
        },
        :format => {
          "robotector" => "http://twitter.com/#!/robotector"
        }
      },
      :googleplus => {
        :sanitize => {
          "https://plus.google.com/u/0/103751848505965231255/posts" => "103751848505965231255",
          "https://plus.google.com/u/0/103751848505965231255" => "103751848505965231255",
          "https://plus.google.com/103751848505965231255/posts" => "103751848505965231255",
        },
        :format => {
          "103751848505965231255" => "https://plus.google.com/103751848505965231255/posts"
        }
      }
    }

    expectations.each do |name, cases|
      # Every extractor is additionally expected to sanitize nil to nil.
      sanitize_cases = (cases[:sanitize] || {}).merge(nil => nil)
      sanitize_cases.each do |input, expected|
        it "#{name} should sanitize properly" do
          Extractors(name).sanitize(input).must_equal(expected)
        end
      end

      format_cases = cases[:format] || {}
      format_cases.each do |input, expected|
        it "#{name} should format properly" do
          Extractors(name).format(input).must_equal(expected)
        end
      end
    end
  end
end
|
81
|
+
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: extractors
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- macovsky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-22 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: minitest
|
16
|
+
requirement: &70126699361480 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70126699361480
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rake
|
27
|
+
requirement: &70126699357780 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70126699357780
|
36
|
+
description: ''
|
37
|
+
email:
|
38
|
+
- robotector@gmail.com
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- Gemfile
|
45
|
+
- README.markdown
|
46
|
+
- Rakefile
|
47
|
+
- extractors.gemspec
|
48
|
+
- lib/extractors.rb
|
49
|
+
- lib/extractors/extractor.rb
|
50
|
+
- lib/extractors/version.rb
|
51
|
+
- specs/extractors_spec.rb
|
52
|
+
- specs/spec_helper.rb
|
53
|
+
homepage: http://github.com/macovsky/extractors
|
54
|
+
licenses: []
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ! '>='
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubyforge_project: extractors
|
73
|
+
rubygems_version: 1.8.10
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: ! 'Extract user accounts from different urls: tumblr, facebook and others.'
|
77
|
+
test_files: []
|