extractors 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.markdown +45 -0
- data/Rakefile +9 -0
- data/extractors.gemspec +24 -0
- data/lib/extractors.rb +174 -0
- data/lib/extractors/extractor.rb +45 -0
- data/lib/extractors/version.rb +3 -0
- data/specs/extractors_spec.rb +81 -0
- data/specs/spec_helper.rb +3 -0
- metadata +77 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
### Extract user accounts from different contexts...
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
Extractors(:twitter).sanitize("http://twitter.com/#!/programmingshit") # => "programmingshit"
|
5
|
+
Extractors(:twitter).sanitize("programmingshit") # => "programmingshit"
|
6
|
+
```
|
7
|
+
|
8
|
+
### ... and format them back
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
Extractors(:twitter).format("programmingshit") # => "http://twitter.com/#!/programmingshit"
|
12
|
+
```
|
13
|
+
|
14
|
+
### Add your own extractors
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
Extractors.add :lookatme do
|
18
|
+
username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
|
19
|
+
sanitizer [
|
20
|
+
%r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
|
21
|
+
%r{^(#{username_regexp})$}
|
22
|
+
]
|
23
|
+
formatter 'http://lookatme.ru/users/%s'
|
24
|
+
end
|
25
|
+
```
|
26
|
+
|
27
|
+
### List of available extractors
|
28
|
+
|
29
|
+
* :facebook
|
30
|
+
* :googleplus
|
31
|
+
* :gtalk
|
32
|
+
* :lastfm
|
33
|
+
* :livejournal
|
34
|
+
* :lookatme
|
35
|
+
* :moikrug
|
36
|
+
* :myspace
|
37
|
+
* :skype
|
38
|
+
* :tumblr
|
39
|
+
* :twitter
|
40
|
+
* :vkontakte
|
41
|
+
* :youtube
|
42
|
+
|
43
|
+
### Credits
|
44
|
+
|
45
|
+
The code itself has been *extracted* from the <http://github.com/toy/contacts> gem.
|
data/Rakefile
ADDED
data/extractors.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "extractors/version"
|
4
|
+
|
5
|
+
# Packaging manifest for the extractors gem: identity, file lists (taken from
# git) and development-time dependencies.
Gem::Specification.new do |s|
  s.name        = "extractors"
  s.version     = Extractors::VERSION
  s.authors     = ["macovsky"]
  s.email       = ["robotector@gmail.com"]
  s.homepage    = "http://github.com/macovsky/extractors"
  s.summary     = %q{Extract user accounts from different urls: tumblr, facebook and others.}
  s.description = %q{}

  s.rubyforge_project = "extractors"

  s.files         = `git ls-files`.split("\n")
  # This gem keeps its specs under specs/ (plural), which the stock
  # {test,spec,features} glob does not match; without "specs" in the list the
  # packaged gem ends up with an empty test_files array.
  s.test_files    = `git ls-files -- {test,spec,specs,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  s.add_development_dependency "minitest"
  s.add_development_dependency "rake"
end
|
data/lib/extractors.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'extractors/version'
|
2
|
+
require 'extractors/extractor'
|
3
|
+
|
4
|
+
# Shortcut for looking up a registered extractor, e.g. Extractors(:twitter).
#
# name - anything responding to #to_sym (Symbol or String).
#
# Returns whatever the registry holds under that key, or nil when nothing is
# registered for it.
def Extractors(name)
  key = name.to_sym
  Extractors.extractors[key]
end
|
7
|
+
|
8
|
+
# Registry of account extractors plus the built-in definitions.
#
# Each extractor is configured with a `sanitizer` (how to pull the account
# name out of user input) and a `formatter` (how to turn an account name back
# into a URL/URI). Patterns in a sanitizer list are tried in order and the
# first one that matches wins, so more specific patterns must come first.
module Extractors
  class << self
    # Creates an Extractor, evaluates the block in the extractor's context (so
    # `sanitizer` and `formatter` can be called bare) and registers it.
    #
    # name  - registry key; converted with #to_sym.
    # block - optional configuration block. Without a block a pass-through
    #         extractor is registered instead of raising from instance_eval.
    #
    # Returns the new Extractor.
    def add(name, &block)
      extractor = Extractor.new
      extractor.instance_eval(&block) if block
      extractors[name.to_sym] = extractor
    end

    # Returns the Hash of registered extractors keyed by Symbol, created
    # lazily on first use.
    def extractors
      @extractors ||= {}
    end
  end

  add :facebook do
    username_regexp = '[0-9]{1,25}'
    literal_username_regexp = '[a-zA-Z.]{1,25}'
    sanitizer [
      # The id-based URLs must be tried before the generic username URL:
      # otherwise "facebook.com/profile.php?id=1" is captured as the username
      # "profile.php" and the id patterns are never reached.
      %r{^(?:http://)?(?:www\.)?facebook\.com/profile\.php\?id=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/group\.php\?gid=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/(#{literal_username_regexp})},
      %r{^(#{literal_username_regexp})$},
      %r{^(#{username_regexp})$}
    ]
    formatter do |value|
      # Purely numeric values are profile ids, everything else is a username.
      if value[%r{^\d+$}]
        "http://facebook.com/profile.php?id=#{value}"
      else
        "http://facebook.com/#{value}"
      end
    end
  end

  # add :flickr do
  #   username_regexp = '[\\-a-zA-Z0-9_@]{1,50}'
  #   sanitizer [
  #     %r{^(?:http://)?(?:www\.)?flickr\.com/(?:photos|people)/(#{username_regexp})},
  #     %r{^(#{username_regexp})$}
  #   ]
  #   formatter 'http://flickr.com/photos/%s'
  # end

  add :googleplus do
    # Dots in the host name are escaped so that they only match a literal ".".
    sanitizer(%r{^(?:https?://)?plus\.google\.com/(?:u/\d/)?(\d+)})
    formatter 'https://plus.google.com/%s/posts'
  end

  add :gtalk do
    sanitizer do |value|
      # nil has no #[]; return nil like the regexp-based extractors do.
      next if value.nil?

      username_regexp = '[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)*'
      if (result = value[%r{^(#{username_regexp})(?:@gmail\.com)?$}, 1])
        # Bare names and @gmail.com addresses: the local part must be 6..30
        # characters long. The range is parenthesised on purpose: an
        # unparenthesised `a && 6..30 === n` parses as a flip-flop
        # `(a && 6)..(30 === n)` and never checks the length.
        "#{result}@gmail.com" if (6..30).include?(result.length)
      else
        # Addresses on other domains are returned as-is when well-formed.
        value[%r{^#{username_regexp}@[a-z0-9]+(\.[a-z0-9]+)+$}]
      end
    end
    formatter 'gtalk:chat?jid=%s'
  end

  # add :icq do
  #   sanitizer %r{^\d+$}
  #   formatter 'http://icq.com/%s'
  # end

  add :lastfm do
    username_regexp = '[a-zA-Z][_a-zA-Z0-9\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?last\.fm/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://last.fm/user/%s'
  end

  add :livejournal do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(?:users|community)\.livejournal\.com/(#{username_regexp})},
        %r{^(?:http://)?(#{username_regexp})\.livejournal\.com},
        %r{^(#{username_regexp})$}
      ])
      # "www" is the site itself, not a journal; underscores become hyphens
      # because the name is used as a subdomain by the formatter.
      result.gsub('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.livejournal.com/'
  end

  add :lookatme do
    username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://lookatme.ru/users/%s'
  end

  add :moikrug do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9][a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(#{username_regexp})\.moikrug\.ru},
        %r{^(#{username_regexp})$}
      ])
      # Same subdomain handling as livejournal: skip "www", "_" becomes "-".
      result.gsub('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.moikrug.ru/'
  end

  add :myspace do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?myspace\.com/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://myspace.com/%s'
  end

  add :skype do
    # No capture group: the whole matched name is the result.
    sanitizer(%r{^[a-z][a-z0-9_,.\-]{5,31}$}i)
    formatter 'skype:%s?userinfo'
  end

  add :tumblr do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      # "www." is optional here, as in every other extractor.
      %r{^(?:http://)?(?:www\.)?tumblr\.com/blog/(#{username_regexp})},
      %r{^(?:http://)?(#{username_regexp})\.tumblr\.com},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://%s.tumblr.com'
  end

  add :twitter do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:https?://)?(?:www\.)?twitter\.com/(?:#!/)?(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://twitter.com/#!/%s'
  end

  add :vkontakte do
    id_regexp = '[0-9]{1,25}'
    username_regexp = '[a-zA-Z][a-zA-Z0-9_\.]{4,}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/id(#{id_regexp})},
      %r{^(#{id_regexp})$},
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/(#{username_regexp})},
      %r{^(#{username_regexp})$},
    ]
    formatter do |value|
      # Anything containing a non-digit is a username; digits only is an id.
      if value[/\D/]
        "http://vkontakte.ru/#{value}"
      else
        "http://vkontakte.ru/id#{value}"
      end
    end
  end

  add :youtube do
    username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?youtube\.com/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://youtube.com/user/%s'
  end
end
|
174
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Extractors
  # One extractor: an optional sanitizer rule that pulls an account name out of
  # raw input, and an optional formatter that renders an account name back.
  class Extractor
    # Stores the sanitizer rule: a Regexp, an Array of Regexps, or a block.
    # A positional argument takes precedence over the block.
    def sanitizer(arg = nil, &block)
      @sanitizer = arg || block
    end

    # Applies the stored sanitizer to +value+.
    #
    # Without a sanitizer the value is returned untouched; a Proc is called
    # with the value; a Regexp or Array of Regexps yields the first capture
    # (or the whole match) of the first matching pattern, nil if none match.
    #
    # Raises RuntimeError for any other kind of sanitizer.
    def sanitize(value)
      rule = @sanitizer
      return value if rule.nil?
      return rule.call(value) if rule.is_a?(Proc)

      if rule.is_a?(Regexp)
        value_from_matching_regexp(value, [rule])
      elsif rule.is_a?(Array)
        value_from_matching_regexp(value, rule)
      else
        raise "Unknown type of sanitizer: #{rule.inspect}"
      end
    end

    # Stores the formatter: a format String (e.g. with %s) or a block.
    # A positional argument takes precedence over the block.
    def formatter(arg = nil, &block)
      @formatter = arg || block
    end

    # Renders +value+ with the stored formatter.
    #
    # Without a formatter the value is returned untouched; a String is used as
    # a format template via String#%; a Proc is called with the value.
    #
    # Raises RuntimeError for any other kind of formatter.
    def format(value)
      template = @formatter
      return value if template.nil?
      return template % value if template.is_a?(String)
      return template.call(value) if template.is_a?(Proc)

      raise "Unknown type of formatter: #{template.inspect}"
    end

    private

    # Tries each regexp against +value+ in order, stopping at the first match.
    # Returns the first capture group of that match, falling back to the whole
    # matched text when there is no (participating) first group; nil when
    # nothing matches.
    def value_from_matching_regexp(value, regexps)
      matched = regexps.find { |regexp| regexp === value }
      captured = Regexp.last_match(1) if matched
      captured || Regexp.last_match(0)
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the extractor registry and a sample of the built-in extractors.
describe Extractors do
  it "can reach extractors by a shortcut" do
    Extractors(:skype).must_be_instance_of Extractors::Extractor
  end

  it "should have default extractors" do
    Extractors.extractors.wont_be_empty
    Extractors.extractors.each_key do |key|
      Extractors.extractors[key].must_be_instance_of Extractors::Extractor
    end
  end

  it "can add new extractors" do
    Extractors.add(:test) {}
    added = Extractors(:test)

    added.wont_be_nil
    added.must_respond_to(:sanitize)
    added.must_respond_to(:format)
  end

  describe "existing extractors" do
    # extractor name => { :sanitize => { input => expected },
    #                     :format   => { input => expected } }
    examples = {
      :tumblr => {
        :sanitize => {
          "http://robotector.tumblr.com" => "robotector",
          "http://www.tumblr.com/blog/robotector" => "robotector",
          "robotector" => "robotector",
        },
        :format => {
          "robotector" => "http://robotector.tumblr.com"
        }
      },
      :lookatme => {
        :sanitize => {
          "lookatme.ru/users/macovsky" => "macovsky",
          "macovsky" => "macovsky",
        },
        :format => {
          "macovsky" => "http://lookatme.ru/users/macovsky"
        }
      },
      :twitter => {
        :sanitize => {
          "twitter.com/robotector" => "robotector",
          "https://twitter.com/#!/robotector" => "robotector",
          "http://twitter.com/#!/robotector" => "robotector"
        },
        :format => {
          "robotector" => "http://twitter.com/#!/robotector"
        }
      },
      :googleplus => {
        :sanitize => {
          "https://plus.google.com/u/0/103751848505965231255/posts" => "103751848505965231255",
          "https://plus.google.com/u/0/103751848505965231255" => "103751848505965231255",
          "https://plus.google.com/103751848505965231255/posts" => "103751848505965231255",
        },
        :format => {
          "103751848505965231255" => "https://plus.google.com/103751848505965231255/posts"
        }
      }
    }

    examples.each do |name, expectations|
      # Every extractor is additionally expected to map nil to nil.
      sanitize_cases = (expectations[:sanitize] || {}).merge(nil => nil)
      sanitize_cases.each do |input, expected|
        it "#{name} should sanitize properly" do
          Extractors(name).sanitize(input).must_equal(expected)
        end
      end

      format_cases = expectations[:format] || {}
      format_cases.each do |input, expected|
        it "#{name} should format properly" do
          Extractors(name).format(input).must_equal(expected)
        end
      end
    end
  end
end
|
81
|
+
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: extractors
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- macovsky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-22 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: minitest
|
16
|
+
requirement: &70126699361480 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70126699361480
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rake
|
27
|
+
requirement: &70126699357780 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70126699357780
|
36
|
+
description: ''
|
37
|
+
email:
|
38
|
+
- robotector@gmail.com
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- Gemfile
|
45
|
+
- README.markdown
|
46
|
+
- Rakefile
|
47
|
+
- extractors.gemspec
|
48
|
+
- lib/extractors.rb
|
49
|
+
- lib/extractors/extractor.rb
|
50
|
+
- lib/extractors/version.rb
|
51
|
+
- specs/extractors_spec.rb
|
52
|
+
- specs/spec_helper.rb
|
53
|
+
homepage: http://github.com/macovsky/extractors
|
54
|
+
licenses: []
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ! '>='
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubyforge_project: extractors
|
73
|
+
rubygems_version: 1.8.10
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: ! 'Extract user accounts from different urls: tumblr, facebook and others.'
|
77
|
+
test_files: []
|