extractors 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in extractors.gemspec
gemspec
@@ -0,0 +1,45 @@
1
+ ### Extract user accounts from different contexts...
2
+
3
+ ```ruby
4
+ Extractors(:twitter).sanitize("http://twitter.com/#!/programmingshit") # => "programmingshit"
5
+ Extractors(:twitter).sanitize("programmingshit") # => "programmingshit"
6
+ ```
7
+
8
+ ### ... and format them back
9
+
10
+ ```ruby
11
+ Extractors(:twitter).format("programmingshit") # => "http://twitter.com/#!/programmingshit"
12
+ ```
13
+
14
+ ### Add your own extractors
15
+
16
+ ```ruby
17
+ Extractors.add :lookatme do
18
+ username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
19
+ sanitizer [
20
+ %r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
21
+ %r{^(#{username_regexp})$}
22
+ ]
23
+ formatter 'http://lookatme.ru/users/%s'
24
+ end
25
+ ```
26
+
27
+ ### List of available extractors
28
+
29
+ * :facebook
30
+ * :googleplus
31
+ * :gtalk
32
+ * :lastfm
33
+ * :livejournal
34
+ * :lookatme
35
+ * :moikrug
36
+ * :myspace
37
+ * :skype
38
+ * :tumblr
39
+ * :twitter
40
+ * :vkontakte
41
+ * :youtube
42
+
43
+ ### Credits
44
+
45
+ The code itself has been *extracted* from the <http://github.com/toy/contacts> gem.
@@ -0,0 +1,9 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# A bare `rake` runs the spec suite.
task :default => :test

# Defines the :test task: puts lib/ and specs/ on the load path and runs
# every specs/*_spec.rb file verbosely.
Rake::TestTask.new do |test_task|
  test_task.libs.push('lib', 'specs')
  test_task.test_files = FileList['specs/*_spec.rb']
  test_task.verbose = true
end
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "extractors/version"
4
+
5
# Gem specification for extractors. File lists are taken from the git index,
# so only tracked files are packaged.
Gem::Specification.new do |s|
  s.name        = "extractors"
  s.version     = Extractors::VERSION
  s.authors     = ["macovsky"]
  s.email       = ["robotector@gmail.com"]
  s.homepage    = "http://github.com/macovsky/extractors"
  s.summary     = %q{Extract user accounts from different urls: tumblr, facebook and others.}
  s.description = %q{}

  s.rubyforge_project = "extractors"

  s.files         = `git ls-files`.split("\n")
  # The suite lives in specs/ (that is what the Rakefile's TestTask runs), which
  # the stock {test,spec,features} glob does not match; list that directory too
  # so test_files is no longer empty.
  s.test_files    = `git ls-files -- {test,spec,specs,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  s.add_development_dependency "minitest"
  s.add_development_dependency "rake"
end
@@ -0,0 +1,174 @@
1
+ require 'extractors/version'
2
+ require 'extractors/extractor'
3
+
4
# Shortcut for fetching a registered extractor: Extractors(:twitter).
#
# @param name [Symbol, String] extractor name; converted with +to_sym+
# @return [Extractors::Extractor, nil] the extractor stored under that name in
#   Extractors.extractors, or nil when nothing is registered under it
def Extractors(name)
  Extractors.extractors[name.to_sym]
end
7
+
8
# Registry of account extractors plus the built-in definitions.
#
# Each extractor is declared with Extractors.add; the block is evaluated
# against a fresh Extractor, so +sanitizer+ and +formatter+ inside it
# configure that instance.
module Extractors
  class <<self
    # Builds an Extractor, configures it by instance_eval-ing the block, and
    # stores it under +name+ (converted to a Symbol). Re-adding a name
    # replaces the previous extractor.
    #
    # @param name [Symbol, String]
    # @return [Extractors::Extractor] the newly registered extractor
    def add(name, &block)
      extractor = Extractor.new
      extractor.instance_eval(&block)
      extractors[name.to_sym] = extractor
    end

    # @return [Hash{Symbol => Extractors::Extractor}] lazily created registry
    def extractors
      @extractors ||= {}
    end
  end

  add :facebook do
    username_regexp = '[0-9]{1,25}'
    literal_username_regexp = '[a-zA-Z.]{1,25}'
    # The profile.php/group.php patterns must be tried before the vanity-URL
    # pattern: the latter has no end anchor and would otherwise capture
    # "profile.php" from "facebook.com/profile.php?id=123".
    sanitizer [
      %r{^(?:http://)?(?:www\.)?facebook\.com/profile\.php\?id=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/group\.php\?gid=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/(#{literal_username_regexp})},
      %r{^(#{literal_username_regexp})$},
      %r{^(#{username_regexp})$}
    ]
    # Purely numeric values are ids and need the profile.php form.
    formatter do |value|
      if value[%r{^\d+$}]
        "http://facebook.com/profile.php?id=#{value}"
      else
        "http://facebook.com/#{value}"
      end
    end
  end

  # add :flickr do
  #   username_regexp = '[\\-a-zA-Z0-9_@]{1,50}'
  #   sanitizer [
  #     %r{^(?:http://)?(?:www\.)?flickr\.com/(?:photos|people)/(#{username_regexp})},
  #     %r{^(#{username_regexp})$}
  #   ]
  #   formatter 'http://flickr.com/photos/%s'
  # end

  add :googleplus do
    # Dots are escaped so that only the literal host plus.google.com matches.
    sanitizer %r{^(?:https?://)?plus\.google\.com/(?:u/\d/)?(\d+)}
    formatter 'https://plus.google.com/%s/posts'
  end

  add :gtalk do
    sanitizer do |value|
      # Non-string input (notably nil) sanitizes to nil, as it does for the
      # regexp-based extractors.
      next unless value.is_a?(String)

      username_regexp = '[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)*'
      result = value[%r{^(#{username_regexp})(?:@gmail\.com)?$}, 1]
      if result
        # Bare or @gmail.com usernames must be 6..30 characters long. The
        # range is parenthesised on purpose: `result && 6..30 === n` parses as
        # `(result && 6)..(30 === n)`, which never enforces the limit.
        "#{result}@gmail.com" if (6..30).include?(result.length)
      else
        # Otherwise accept a full address on any other domain as-is.
        value[%r{^#{username_regexp}@[a-z0-9]+(\.[a-z0-9]+)+$}]
      end
    end
    formatter 'gtalk:chat?jid=%s'
  end

  # add :icq do
  #   sanitizer %r{^\d+$}
  #   formatter 'http://icq.com/%s'
  # end

  add :lastfm do
    username_regexp = '[a-zA-Z][_a-zA-Z0-9\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?last\.fm/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://last.fm/user/%s'
  end

  add :livejournal do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(?:users|community)\.livejournal\.com/(#{username_regexp})},
        %r{^(?:http://)?(#{username_regexp})\.livejournal\.com},
        %r{^(#{username_regexp})$}
      ])
      # "www" is the site itself, not a journal; underscores become hyphens
      # because the result is used as a subdomain by the formatter.
      result.gsub('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.livejournal.com/'
  end

  add :lookatme do
    username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://lookatme.ru/users/%s'
  end

  add :moikrug do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9][a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(#{username_regexp})\.moikrug\.ru},
        %r{^(#{username_regexp})$}
      ])
      # Same subdomain handling as livejournal: drop "www", hyphenate underscores.
      result.gsub('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.moikrug.ru/'
  end

  add :myspace do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?myspace\.com/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://myspace.com/%s'
  end

  add :skype do
    sanitizer %r{^[a-z][a-z0-9_,.\-]{5,31}$}i
    formatter 'skype:%s?userinfo'
  end

  add :tumblr do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      # "www." is optional here, as in every other extractor.
      %r{^(?:http://)?(?:www\.)?tumblr\.com/blog/(#{username_regexp})},
      %r{^(?:http://)?(#{username_regexp})\.tumblr\.com},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://%s.tumblr.com'
  end

  add :twitter do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:https?://)?(?:www\.)?twitter\.com/(?:#!/)?(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://twitter.com/#!/%s'
  end

  add :vkontakte do
    id_regexp = '[0-9]{1,25}'
    username_regexp = '[a-zA-Z][a-zA-Z0-9_\.]{4,}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/id(#{id_regexp})},
      %r{^(#{id_regexp})$},
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/(#{username_regexp})},
      %r{^(#{username_regexp})$},
    ]
    # Values containing any non-digit are usernames; all-digit values are ids.
    formatter do |value|
      if value[/\D/]
        "http://vkontakte.ru/#{value}"
      else
        "http://vkontakte.ru/id#{value}"
      end
    end
  end

  add :youtube do
    username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?youtube\.com/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://youtube.com/user/%s'
  end
end
174
+
@@ -0,0 +1,45 @@
1
module Extractors
  # Pairs a sanitizer (raw input -> account name) with a formatter
  # (account name -> URL or URI string).
  #
  # +sanitizer+ and +formatter+ are setters meant to be called from the block
  # given to Extractors.add; +sanitize+ and +format+ apply what they stored.
  class Extractor
    # Stores the sanitizer.
    #
    # @param arg [Regexp, Array<Regexp>, nil] pattern, or patterns tried in order
    # @param block [Proc] used when +arg+ is nil; receives the raw value
    def sanitizer(arg = nil, &block)
      @sanitizer = arg || block
    end

    # Extracts the account name from +value+.
    #
    # @param value [String, nil] raw user input
    # @return [String, nil] the first capture group of the first matching
    #   pattern (the whole match when that group is absent), the proc's
    #   result, or +value+ itself when no sanitizer was configured; nil when
    #   nothing matches
    # @raise [RuntimeError] if the stored sanitizer is of an unsupported type
    def sanitize(value)
      case @sanitizer
      when Regexp
        value_from_matching_regexp(value, [@sanitizer])
      when Array
        value_from_matching_regexp(value, @sanitizer)
      when Proc
        # Pattern sanitizers already map nil to nil; do the same for procs so
        # that they need not guard against nil input themselves.
        @sanitizer.call(value) unless value.nil?
      when nil
        value
      else
        raise "Unknown type of sanitizer: #{@sanitizer.inspect}"
      end
    end

    # Stores the formatter.
    #
    # @param arg [String, nil] format string applied with String#%
    # @param block [Proc] used when +arg+ is nil; receives the account name
    def formatter(arg = nil, &block)
      @formatter = arg || block
    end

    # Turns an account name back into its formatted representation.
    #
    # @param value [String] account name
    # @return [Object] the format string applied to +value+, the proc's result,
    #   or +value+ itself when no formatter was configured
    # @raise [RuntimeError] if the stored formatter is of an unsupported type
    def format(value)
      case @formatter
      when String
        @formatter % value
      when Proc
        @formatter.call(value)
      when nil
        value
      else
        raise "Unknown type of formatter: #{@formatter.inspect}"
      end
    end

    private

    # Tries +regexps+ in order against +value+.
    #
    # @return [String, nil] capture group 1 of the first pattern that matches,
    #   or the whole match when group 1 is absent or did not participate;
    #   nil when no pattern matches
    def value_from_matching_regexp(value, regexps)
      regexps.each do |regexp|
        # === (rather than #match) returns false for non-string input
        # instead of raising.
        next unless regexp === value
        match = Regexp.last_match
        return match[1] || match[0]
      end
      nil
    end
  end
end
@@ -0,0 +1,3 @@
1
module Extractors
  # Gem version, read by extractors.gemspec. Frozen so the shared constant
  # cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,81 @@
1
+ require 'spec_helper'
2
+
3
describe Extractors do
  it "can reach extractors by a shortcut" do
    Extractors(:skype).must_be_instance_of Extractors::Extractor
  end

  it "should have default extractors" do
    Extractors.extractors.wont_be_empty
    Extractors.extractors.keys.each do |extractor|
      Extractors.extractors[extractor].must_be_instance_of Extractors::Extractor
    end
  end

  it "can add new extractors" do
    Extractors.add :test do
    end
    extractor = Extractors(:test)

    extractor.wont_be_nil
    extractor.must_respond_to(:sanitize)
    extractor.must_respond_to(:format)
  end

  describe "existing extractors" do
    # Table of input => expected output pairs, per extractor and per operation.
    {
      :tumblr => {
        :sanitize => {
          "http://robotector.tumblr.com" => "robotector",
          "http://www.tumblr.com/blog/robotector" => "robotector",
          "robotector" => "robotector",
        },
        :format => {
          "robotector" => "http://robotector.tumblr.com"
        }
      },
      :lookatme => {
        :sanitize => {
          "lookatme.ru/users/macovsky" => "macovsky",
          "macovsky" => "macovsky",
        },
        :format => {
          "macovsky" => "http://lookatme.ru/users/macovsky"
        }
      },
      :twitter => {
        :sanitize => {
          "twitter.com/robotector" => "robotector",
          "https://twitter.com/#!/robotector" => "robotector",
          "http://twitter.com/#!/robotector" => "robotector",
          # Bare usernames pass through unchanged, as the README promises.
          "robotector" => "robotector"
        },
        :format => {
          "robotector" => "http://twitter.com/#!/robotector"
        }
      },
      :googleplus => {
        :sanitize => {
          "https://plus.google.com/u/0/103751848505965231255/posts" => "103751848505965231255",
          "https://plus.google.com/u/0/103751848505965231255" => "103751848505965231255",
          "https://plus.google.com/103751848505965231255/posts" => "103751848505965231255",
        },
        :format => {
          "103751848505965231255" => "https://plus.google.com/103751848505965231255/posts"
        }
      }
    }.each do |name, sanitize_and_format|
      # Every extractor must also map nil to nil, hence the extra pair.
      (sanitize_and_format[:sanitize] || {}).merge(nil => nil).each do |k, v|
        # The input is part of the example name so a failure identifies its case.
        it "#{name} should sanitize #{k.inspect} properly" do
          result = Extractors(name).sanitize(k)
          # must_equal(nil) is deprecated in Minitest 5 and rejected by later
          # releases; nil expectations need must_be_nil.
          if v.nil?
            result.must_be_nil
          else
            result.must_equal(v)
          end
        end
      end

      (sanitize_and_format[:format] || {}).each do |k, v|
        it "#{name} should format #{k.inspect} properly" do
          Extractors(name).format(k).must_equal(v)
        end
      end
    end
  end
end
81
+
@@ -0,0 +1,3 @@
1
+ require 'minitest/spec'
2
+ require 'minitest/autorun'
3
+ require 'extractors'
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extractors
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - macovsky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-02-22 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: minitest
16
+ requirement: &70126699361480 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70126699361480
25
+ - !ruby/object:Gem::Dependency
26
+ name: rake
27
+ requirement: &70126699357780 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70126699357780
36
+ description: ''
37
+ email:
38
+ - robotector@gmail.com
39
+ executables: []
40
+ extensions: []
41
+ extra_rdoc_files: []
42
+ files:
43
+ - .gitignore
44
+ - Gemfile
45
+ - README.markdown
46
+ - Rakefile
47
+ - extractors.gemspec
48
+ - lib/extractors.rb
49
+ - lib/extractors/extractor.rb
50
+ - lib/extractors/version.rb
51
+ - specs/extractors_spec.rb
52
+ - specs/spec_helper.rb
53
+ homepage: http://github.com/macovsky/extractors
54
+ licenses: []
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ! '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubyforge_project: extractors
73
+ rubygems_version: 1.8.10
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: ! 'Extract user accounts from different urls: tumblr, facebook and others.'
77
+ test_files: []