extractors 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in extractors.gemspec
gemspec
@@ -0,0 +1,45 @@
1
+ ### Extract user accounts from different contexts...
2
+
3
+ ```ruby
4
+ Extractors(:twitter).sanitize("http://twitter.com/#!/programmingshit") # => "programmingshit"
5
+ Extractors(:twitter).sanitize("programmingshit") # => "programmingshit"
6
+ ```
7
+
8
+ ### ... and format them back
9
+
10
+ ```ruby
11
+ Extractors(:twitter).format("programmingshit") # => "http://twitter.com/#!/programmingshit"
12
+ ```
13
+
14
+ ### Add your own extractors
15
+
16
+ ```ruby
17
+ Extractors.add :lookatme do
18
+ username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
19
+ sanitizer [
20
+ %r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
21
+ %r{^(#{username_regexp})$}
22
+ ]
23
+ formatter 'http://lookatme.ru/users/%s'
24
+ end
25
+ ```
26
+
27
+ ### List of available extractors
28
+
29
+ * :facebook
30
+ * :googleplus
31
+ * :gtalk
32
+ * :lastfm
33
+ * :livejournal
34
+ * :lookatme
35
+ * :moikrug
36
+ * :myspace
37
+ * :skype
38
+ * :tumblr
39
+ * :twitter
40
+ * :vkontakte
41
+ * :youtube
42
+
43
+ ### Credits
44
+
45
+ The code itself has been *extracted* from the <http://github.com/toy/contacts> gem.
@@ -0,0 +1,9 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# A bare `rake` runs the spec suite.
task :default => :test

# Defines the test task: puts lib/ and specs/ on the load path and runs every
# specs/*_spec.rb file with verbose output.
Rake::TestTask.new do |spec_task|
  spec_task.libs.push('lib', 'specs')
  spec_task.test_files = FileList['specs/*_spec.rb']
  spec_task.verbose = true
end
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for the extractors gem.
# lib/ is put on the load path so the version constant can be required below.
$:.push File.expand_path("../lib", __FILE__)
require "extractors/version"

Gem::Specification.new do |s|
  s.name        = "extractors"
  s.version     = Extractors::VERSION
  s.authors     = ["macovsky"]
  s.email       = ["robotector@gmail.com"]
  s.homepage    = "http://github.com/macovsky/extractors"
  s.summary     = %q{Extract user accounts from different urls: tumblr, facebook and others.}
  # A non-empty description avoids the "no description specified" build warning.
  s.description = %q{Extracts user account names from profile urls of various services (twitter, tumblr, facebook and others) and formats them back into urls.}

  # Guarded so the gemspec still loads if this RubyGems has no such setter.
  s.rubyforge_project = "extractors" if s.respond_to?(:rubyforge_project=)

  # The file lists come from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  # This project keeps its specs in specs/, which the stock glob did not cover.
  s.test_files    = `git ls-files -- {test,spec,specs,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  s.add_development_dependency "minitest"
  s.add_development_dependency "rake"
end
@@ -0,0 +1,174 @@
1
+ require 'extractors/version'
2
+ require 'extractors/extractor'
3
+
4
# Shortcut for fetching a registered extractor, e.g. +Extractors(:twitter)+.
# +name+ is converted with +to_sym+ before the lookup; the result is nil when
# no extractor is registered under that name.
def Extractors(name)
  registry = Extractors.extractors
  registry[name.to_sym]
end
7
+
8
# Registry of account extractors, keyed by service name.
#
# Every extractor is declared with +add+: the block is evaluated against a
# fresh Extractor, where +sanitizer+ describes how to pull an account name out
# of user input and +formatter+ describes how to turn it back into a URL.
module Extractors
  class << self
    # Creates an Extractor, configures it with +block+ and stores it under
    # +name+ (converted to a Symbol). Returns the new extractor.
    def add(name, &block)
      extractor = Extractor.new
      # instance_eval raises when handed no block, so only configure if one was given.
      extractor.instance_eval(&block) if block
      extractors[name.to_sym] = extractor
    end

    # Hash of all registered extractors (Symbol => Extractor), created lazily.
    def extractors
      @extractors ||= {}
    end
  end

  add :facebook do
    username_regexp = '[0-9]{1,25}'
    literal_username_regexp = '[a-zA-Z.]{1,25}'
    sanitizer [
      # The id-based URLs must be tried first: the literal-username URL pattern
      # would otherwise capture "profile.php" / "group.php" as the username.
      %r{^(?:http://)?(?:www\.)?facebook\.com/profile\.php\?id=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/group\.php\?gid=(#{username_regexp})},
      %r{^(?:http://)?(?:www\.)?facebook\.com/(#{literal_username_regexp})},
      %r{^(#{literal_username_regexp})$},
      %r{^(#{username_regexp})$}
    ]
    formatter do |value|
      # Purely numeric values are profile ids, anything else is a username.
      if value[%r{^\d+$}]
        "http://facebook.com/profile.php?id=#{value}"
      else
        "http://facebook.com/#{value}"
      end
    end
  end

  # add :flickr do
  #   username_regexp = '[\\-a-zA-Z0-9_@]{1,50}'
  #   sanitizer [
  #     %r{^(?:http://)?(?:www\.)?flickr\.com/(?:photos|people)/(#{username_regexp})},
  #     %r{^(#{username_regexp})$}
  #   ]
  #   formatter 'http://flickr.com/photos/%s'
  # end

  add :googleplus do
    sanitizer %r{^(?:https?://)?plus\.google\.com/(?:u/\d/)?(\d+)}
    formatter 'https://plus.google.com/%s/posts'
  end

  add :gtalk do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)*'
      if value.nil?
        # Nothing to sanitize; mirrors the regexp-based extractors, which yield nil for nil.
        nil
      elsif result = value[%r{^(#{username_regexp})(?:@gmail\.com)?$}, 1]
        # Bare or @gmail.com names are accepted only when 6 to 30 characters long.
        # (Written as `result && 6..30 === result.length` this parsed as a
        # flip-flop and the length was never actually checked.)
        "#{result}@gmail.com" if result.length.between?(6, 30)
      elsif result = value[%r{^#{username_regexp}@[a-z0-9]+(\.[a-z0-9]+)+$}]
        # Addresses on other domains are returned as given.
        result
      end
    end
    formatter 'gtalk:chat?jid=%s'
  end

  # add :icq do
  #   sanitizer %r{^\d+$}
  #   formatter 'http://icq.com/%s'
  # end

  add :lastfm do
    username_regexp = '[a-zA-Z][_a-zA-Z0-9\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?last\.fm/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://last.fm/user/%s'
  end

  add :livejournal do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(?:users|community)\.livejournal\.com/(#{username_regexp})},
        %r{^(?:http://)?(#{username_regexp})\.livejournal\.com},
        %r{^(#{username_regexp})$}
      ])
      # "www" is the site's own host name, not an account; the formatter puts
      # the name into the host part of the URL, hence underscores become dashes.
      result.tr('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.livejournal.com/'
  end

  add :lookatme do
    username_regexp = '[a-zA-Z0-9_\\-]{3,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?lookatme\.ru/users/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://lookatme.ru/users/%s'
  end

  add :moikrug do
    sanitizer do |value|
      username_regexp = '[a-zA-Z0-9][a-zA-Z0-9_\\-]{1,20}'
      result = value_from_matching_regexp(value, [
        %r{^(?:http://)?(#{username_regexp})\.moikrug\.ru},
        %r{^(#{username_regexp})$}
      ])
      # Same post-processing as livejournal: skip "www", use dashes in the host part.
      result.tr('_', '-') if result && result != 'www'
    end
    formatter 'http://%s.moikrug.ru/'
  end

  add :myspace do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?myspace\.com/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://myspace.com/%s'
  end

  add :skype do
    sanitizer %r{^[a-z][a-z0-9_,.\-]{5,31}$}i
    formatter 'skype:%s?userinfo'
  end

  add :tumblr do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      # "www." is optional here, as in every other extractor.
      %r{^(?:http://)?(?:www\.)?tumblr\.com/blog/(#{username_regexp})},
      %r{^(?:http://)?(#{username_regexp})\.tumblr\.com},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://%s.tumblr.com'
  end

  add :twitter do
    username_regexp = '[a-zA-Z0-9_\\-]{1,25}'
    sanitizer [
      %r{^(?:https?://)?(?:www\.)?twitter\.com/(?:#!/)?(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://twitter.com/#!/%s'
  end

  add :vkontakte do
    id_regexp = '[0-9]{1,25}'
    username_regexp = '[a-zA-Z][a-zA-Z0-9_\.]{4,}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/id(#{id_regexp})},
      %r{^(#{id_regexp})$},
      %r{^(?:http://)?(?:www\.)?vkontakte\.ru/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter do |value|
      # Anything containing a non-digit is a username; all-digit values are ids.
      if value[/\D/]
        "http://vkontakte.ru/#{value}"
      else
        "http://vkontakte.ru/id#{value}"
      end
    end
  end

  add :youtube do
    username_regexp = '[a-zA-Z0-9_\\-]{1,20}'
    sanitizer [
      %r{^(?:http://)?(?:www\.)?youtube\.com/user/(#{username_regexp})},
      %r{^(#{username_regexp})$}
    ]
    formatter 'http://youtube.com/user/%s'
  end
end
174
+
@@ -0,0 +1,45 @@
1
module Extractors
  # Pairs a sanitizer (raw input -> account name) with a formatter
  # (account name -> presentable string). Both are optional; when one is not
  # configured the corresponding operation returns its input unchanged.
  class Extractor
    # Stores the sanitizer: either the positional +arg+ or, when that is
    # nil/false, the given block.
    def sanitizer(arg = nil, &block)
      @sanitizer = arg || block
    end

    # Applies the configured sanitizer to +value+.
    #
    # * no sanitizer     - returns +value+ as is
    # * Proc             - returns whatever the proc returns for +value+
    # * Regexp / Array   - returns the first capture group (or the whole match)
    #                      of the first matching pattern, nil when none matches
    #
    # Raises RuntimeError for any other kind of sanitizer.
    def sanitize(value)
      rule = @sanitizer
      return value if rule.nil?
      return rule.call(value) if rule.is_a?(Proc)
      return value_from_matching_regexp(value, [rule]) if rule.is_a?(Regexp)
      return value_from_matching_regexp(value, rule) if rule.is_a?(Array)

      raise "Unknown type of sanitizer: #{rule.inspect}"
    end

    # Stores the formatter: either the positional +arg+ or, when that is
    # nil/false, the given block.
    def formatter(arg = nil, &block)
      @formatter = arg || block
    end

    # Applies the configured formatter to +value+.
    #
    # * no formatter - returns +value+ as is
    # * String       - used as a format template (<tt>template % value</tt>)
    # * Proc         - returns whatever the proc returns for +value+
    #
    # Raises RuntimeError for any other kind of formatter.
    def format(value)
      template = @formatter
      return value if template.nil?
      return template % value if template.is_a?(String)
      return template.call(value) if template.is_a?(Proc)

      raise "Unknown type of formatter: #{template.inspect}"
    end

    private

    # Tries +regexps+ in order against +value+ and stops at the first match.
    # Returns that match's first capture group, falling back to the whole
    # matched text when there is no (participating) first group; returns nil
    # when nothing matches.
    def value_from_matching_regexp(value, regexps)
      return nil unless regexps.any? { |pattern| pattern === value }

      # The match made inside the block is visible here as the last match.
      Regexp.last_match(1) || Regexp.last_match(0)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Extractors
  # Version of the gem; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,81 @@
1
+ require 'spec_helper'
2
+
3
describe Extractors do
  it "can reach extractors by a shortcut" do
    Extractors(:skype).must_be_instance_of Extractors::Extractor
  end

  it "should have default extractors" do
    registry = Extractors.extractors
    registry.wont_be_empty
    registry.each_value do |registered|
      registered.must_be_instance_of Extractors::Extractor
    end
  end

  it "can add new extractors" do
    Extractors.add(:test) { }
    extractor = Extractors(:test)

    extractor.wont_be_nil
    extractor.must_respond_to(:sanitize)
    extractor.must_respond_to(:format)
  end

  describe "existing extractors" do
    # Per extractor: input => expected output, for sanitize and for format.
    expectations = {
      :tumblr => {
        :sanitize => {
          "http://robotector.tumblr.com" => "robotector",
          "http://www.tumblr.com/blog/robotector" => "robotector",
          "robotector" => "robotector",
        },
        :format => {
          "robotector" => "http://robotector.tumblr.com"
        }
      },
      :lookatme => {
        :sanitize => {
          "lookatme.ru/users/macovsky" => "macovsky",
          "macovsky" => "macovsky",
        },
        :format => {
          "macovsky" => "http://lookatme.ru/users/macovsky"
        }
      },
      :twitter => {
        :sanitize => {
          "twitter.com/robotector" => "robotector",
          "https://twitter.com/#!/robotector" => "robotector",
          "http://twitter.com/#!/robotector" => "robotector"
        },
        :format => {
          "robotector" => "http://twitter.com/#!/robotector"
        }
      },
      :googleplus => {
        :sanitize => {
          "https://plus.google.com/u/0/103751848505965231255/posts" => "103751848505965231255",
          "https://plus.google.com/u/0/103751848505965231255" => "103751848505965231255",
          "https://plus.google.com/103751848505965231255/posts" => "103751848505965231255",
        },
        :format => {
          "103751848505965231255" => "https://plus.google.com/103751848505965231255/posts"
        }
      }
    }

    expectations.each do |name, cases|
      # Every extractor listed above is also checked with nil input => nil.
      sanitize_cases = (cases[:sanitize] || {}).merge(nil => nil)
      sanitize_cases.each do |input, expected|
        it "#{name} should sanitize properly" do
          Extractors(name).sanitize(input).must_equal(expected)
        end
      end

      format_cases = cases[:format] || {}
      format_cases.each do |input, expected|
        it "#{name} should format properly" do
          Extractors(name).format(input).must_equal(expected)
        end
      end
    end
  end
end
81
+
@@ -0,0 +1,3 @@
1
+ require 'minitest/spec'
2
+ require 'minitest/autorun'
3
+ require 'extractors'
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extractors
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - macovsky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-02-22 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: minitest
16
+ requirement: &70126699361480 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70126699361480
25
+ - !ruby/object:Gem::Dependency
26
+ name: rake
27
+ requirement: &70126699357780 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70126699357780
36
+ description: ''
37
+ email:
38
+ - robotector@gmail.com
39
+ executables: []
40
+ extensions: []
41
+ extra_rdoc_files: []
42
+ files:
43
+ - .gitignore
44
+ - Gemfile
45
+ - README.markdown
46
+ - Rakefile
47
+ - extractors.gemspec
48
+ - lib/extractors.rb
49
+ - lib/extractors/extractor.rb
50
+ - lib/extractors/version.rb
51
+ - specs/extractors_spec.rb
52
+ - specs/spec_helper.rb
53
+ homepage: http://github.com/macovsky/extractors
54
+ licenses: []
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ! '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubyforge_project: extractors
73
+ rubygems_version: 1.8.10
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: ! 'Extract user accounts from different urls: tumblr, facebook and others.'
77
+ test_files: []