socializer-scraper 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e3be782e7e3dd71ba2ec0b6c8cd7ffb35c65305f
+   data.tar.gz: f548cdcfe90f8e3370a9d3148c759fd623765a89
+ SHA512:
+   metadata.gz: 9a37596beb23d40ab660d9f5becdf875033f9344fbebfb781d4ced69dd5e98baf274a72764c26f783a3a9b2451f0e3c3422a54e247f698ad225f24778010c52a
+   data.tar.gz: 0d9a89741130f94df7fb993a84de6d1aba5d2d7ab290954dc5b9d4141940f778dbe88a260521b86896208a96b1ca54ad33d8865f38c74269462d231ee8d92c9a
data/.gitignore ADDED
@@ -0,0 +1,20 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ data/
+ tags
+ scripts/
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
+ language: ruby
+ rvm:
+   - 2.0.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in socializer-scraper.gemspec
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,8 @@
+ # A sample Guardfile
+ # More info at https://github.com/guard/guard#readme
+
+ guard 'yard' do
+   watch(%r{app/.+\.rb})
+   watch(%r{lib/.+\.rb})
+   watch(%r{ext/.+\.c})
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Nikhil Gupta
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Socializer::Scraper
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'socializer-scraper'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install socializer-scraper
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/socializer-scraper ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env ruby
+
+ require 'socializer/scraper'
data/lib/socializer/scraper/collector.rb ADDED
@@ -0,0 +1,84 @@
+ module Socializer
+   module Scraper
+     module Collector
+       def email_collector
+
+         at_re = "(@|\\s*[\\[|\\(|\\{](@|at)[\\]|\\)|\\}]\\s*)"
+         dt_re = "(\\.|\\s*[\\[|\\(|\\{](\\.|dot)[\\]|\\)|\\}]\\s*)"
+         regex = /([A-Z0-9._%-]+#{at_re}([A-Z0-9-]+#{dt_re})+[A-Z]{2,4})/i
+
+         emails = @page.body.scan(regex).map do |a|
+           "mailto:" + a[0].gsub(a[1], "@").gsub(a[4], ".")
+         end rescue []
+
+         (emails | page_links).map do |e|
+           uri = URI.parse(URI.encode(e))
+           uri.to_s if uri.mail?
+         end.compact
+       end
+
+       def sitemap_collector
+         @current_url
+       end
+
+       def link_collector
+         page_links.map do |link|
+           begin
+             uri = URI.parse(link).absolute(@url.host, @url.scheme)
+
+             case
+             when uri.url? && uri.host == @url.host then { internal: uri.to_s }
+             when uri.url? then { external: link }
+             when uri.scheme then { uri.scheme.to_sym => link }
+             else { unknown: link }
+             end
+           rescue URI::InvalidURIError
+             { unknown: link }
+           end
+         end.collect_as_hash
+       end
+
+       def live_link_collector
+         page_links.map do |link|
+           begin
+             uri = URI.parse(link).absolute(@url.host, @url.scheme)
+
+             case
+             when uri.respond_to?(:error?) && (error = uri.error?)
+               then { error => uri.to_s }
+             when uri.url? && uri.host == @url.host
+               then { internal: uri.to_s }
+             when uri.url? then { external: link }
+             when uri.scheme then { uri.scheme => link }
+             else { unknown: link }
+             end
+           rescue URI::InvalidURIError
+             { unknown: link }
+           end
+         end.collect_as_hash
+       end
+
+       def social_profile_collector options = {}
+         default  = [ :facebook, :twitter, :github ]
+         required = options.select{ |k, v| v }.keys
+         allowed  = if options.empty?
+           default
+         elsif required.any?
+           required - (required - default)
+         else
+           default - options.keys
+         end
+
+         allowed = allowed.map{ |a| { a => [] } }.collect_as_hash
+
+         allowed.hash_map do |provider|
+           regex = /#{provider}\.com\/[^\/]*$/
+           links = page_links.map do |link|
+             link =~ regex ? link : nil
+           end.accumulate
+           [provider, links]
+         end
+       end
+     end
+   end
+ end
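The collectors above return either flat arrays (`email_collector`, `sitemap_collector`) or arrays of one-key hashes that `Array#collect_as_hash` (defined in `extensions.rb` below) merges by key. A minimal sketch of that grouping step, with made-up URLs; the `each_with_object` here only mirrors what `collect_as_hash` does:

```ruby
# Illustration only: the per-link hashes that link_collector emits before
# they are merged. All URLs are invented for the example.
per_link = [
  { internal: "http://example.com/about" },
  { external: "http://twitter.com/someone" },
  { internal: "http://example.com/contact" },
  { mailto:   "mailto:me@example.com" }
]

# collect_as_hash groups one-key hashes by key and de-duplicates the values,
# roughly equivalent to:
grouped = per_link.each_with_object(Hash.new { |h, k| h[k] = [] }) do |entry, acc|
  entry.each { |kind, link| acc[kind] |= [link] }
end

p grouped
# => {:internal=>["http://example.com/about", "http://example.com/contact"],
#     :external=>["http://twitter.com/someone"],
#     :mailto=>["mailto:me@example.com"]}
```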
data/lib/socializer/scraper/detector.rb ADDED
@@ -0,0 +1,15 @@
+ module Socializer
+   module Scraper
+     module Detector
+
+       def similar_pages
+
+       end
+
+       def ssl_page_exists?
+
+       end
+
+     end
+   end
+ end
data/lib/socializer/scraper/extensions.rb ADDED
@@ -0,0 +1,109 @@
+ class Array
+
+   def hash_collection?
+     flatten.compact.reject{|v| v.is_a?(Hash)}.empty?
+   end
+
+   def collect_as_hash
+     raise StandardError, "Array is not a hash collection!" unless hash_collection?
+     flatten.compact.each_with_object(Hash.new([])) do |h1,h|
+       h1.each{|k,v| h[k] = (h[k] | [v]).accumulate }
+     end
+   end
+
+   def accumulate
+     flatten.compact.uniq
+   end
+
+   def hashify_or_collect
+     hash_collection? ? collect_as_hash : accumulate
+   end
+
+   def extract_options!
+     last.is_a?(Hash) && last.instance_of?(Hash) ? pop : {}
+   end
+ end
+
+ class Hash
+   def hash_map &block
+     Hash[self.map{|key, value| yield(key, value) }]
+   end
+
+   def hash_collection?
+     true
+   end
+
+   def collect_as_hash
+     self
+   end
+   alias :hashify_or_collect :collect_as_hash
+ end
+
+ class String
+   def url?
+     self =~ /^#{URI::regexp}$/
+   end
+
+   def blank?
+     strip.empty?
+   end
+ end
+
+ module URI
+
+   class Generic
+     def url?
+       %w[ http https ].include?(scheme)
+     end
+
+     def mail?
+       scheme == "mailto"
+     end
+
+     def absolute(host, scheme = nil)
+       return self unless self.scheme.nil?
+       path = to_s.start_with?("/") ? to_s : "/#{to_s}"
+       URI.parse("#{scheme.blank? ? "http" : scheme}://#{host}#{path}")
+     end
+
+   end
+
+   class HTTP
+     def error?
+       return :unknown unless url?
+       puts "Testing URL: #{self}"
+       req = Net::HTTP.new(host, port)
+       req.use_ssl = is_a?(URI::HTTPS)
+       res = req.request_head(path.empty? ? "/" : path)
+       if res.kind_of?(Net::HTTPRedirection)
+         URI.parse(res["location"]).absolute(host, scheme).error?
+       else
+         case
+         when res.code == "401" || res.code == "407" then :unauthorized
+         when res.code == "403" then :forbidden
+         when res.code == "404" then :not_found
+         when res.code[0] == "4" then :client_error
+         when res.code == "503" then :temporary_server_error
+         when res.code[0] == "5" then :server_error
+         end
+       end
+     rescue ::Errno::ENOENT, ::SocketError
+       :no_such_server
+     end
+   end
+ end
+
+ class Object
+   def accumulate
+     [ self ].accumulate
+   end
+
+   def blank?
+     obj = respond_to?(:strip) ? strip : self
+     obj.respond_to?(:empty?) ? obj.empty? : !obj
+   end
+
+   def present?
+     !blank?
+   end
+ end
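Under the monkey patches above, the Array helpers behave roughly as follows (a sketch of expected results, assuming the gem is installed and loaded through its top-level require):

```ruby
require "socializer/scraper"  # loads extensions.rb along with the rest of the gem

[[1, nil, 2], [2, 3]].accumulate
# => [1, 2, 3]                       # flatten + compact + uniq

[{ a: 1 }, { a: 2 }, { b: 3 }].hash_collection?
# => true                            # every element is a Hash

[{ a: 1 }, { a: 2 }, { b: 3 }].collect_as_hash
# => { a: [1, 2], b: [3] }           # values grouped per key and accumulated
```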
data/lib/socializer/scraper/extractor.rb ADDED
@@ -0,0 +1,89 @@
+ module Socializer
+   module Scraper
+     class Extractor
+
+       include Socializer::Scraper::Collector
+
+       attr_reader :url
+       attr_writer :collectors
+
+       def initialize options = {}
+         self.url        = options.fetch(:url, nil)
+         self.collectors = options.fetch(:collectors, [])
+       end
+
+       # Set the URL to crawl for this Crawler instance.
+       #
+       # @param url [String] URL or domain name to crawl.
+       # @return [String] url
+       def url= url
+         return unless url
+         @url = URI.parse(url)
+         message = "Please, provide a URL that starts with HTTP or HTTPS"
+         raise URI::InvalidURIError, message unless @url.url?
+       end
+
+       def collectors
+         @collectors.any? ? @collectors : self.class.available_collectors
+       end
+
+       def run *patterns, &block
+         data, options = {}, patterns.extract_options!
+         page_wise = options.fetch(:page_wise, false)
+
+         perform(*patterns) do |page|
+           collectors.each do |collector|
+             found = send("#{collector}_collector")
+             yield(page, collector, found) if block_given?
+             if page_wise
+               data[collector] ||= {}
+               data[collector][@current_url] = found
+             else
+               data[collector] ||= []
+               data[collector].push found
+             end
+           end
+         end
+
+         data.hash_map{|kind, list| [kind, list.hashify_or_collect]}
+       end
+
+       class << self
+         def available_collectors
+           self.instance_methods.select do |name|
+             name.to_s.end_with?("_collector")
+           end.map do |name|
+             name.to_s.gsub(/_collector$/, '').to_sym
+           end
+         end
+       end
+
+       protected
+
+       def page_html
+         @html ||= Nokogiri::HTML(@page.body)
+       end
+
+       def page_links
+         page_html.search("a").map{|a| a.attr("href")}.accumulate
+       end
+
+       private
+
+       def perform *patterns, &block
+         message = "Please, provide a URL that starts with HTTP or HTTPS"
+         raise URI::InvalidURIError, message unless @url.url?
+
+         patterns.push(/.*/) if patterns.empty?
+
+         Anemone.crawl(@url) do |anemone|
+           anemone.storage = Anemone::Storage.MongoDB
+           anemone.on_pages_like(*patterns) do |page|
+             @page, @html, @current_url = page, nil, page.url
+             yield(page)
+           end
+         end
+       end
+     end
+   end
+ end
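For context, a hedged usage sketch of the Extractor API defined above. The URL is hypothetical, and an actual crawl needs network access plus a reachable MongoDB instance, since `perform` wires Anemone to `Anemone::Storage.MongoDB`:

```ruby
require "socializer/scraper"

# :email and :link are two of the collectors mixed in from Collector;
# passing none falls back to Extractor.available_collectors.
extractor = Socializer::Scraper::Extractor.new(
  url:        "http://example.com",   # hypothetical target
  collectors: [:email, :link]
)

# page_wise: true groups results per crawled page; the block is optional and
# fires once per (page, collector) pair while the crawl runs.
results = extractor.run(page_wise: true) do |page, collector, found|
  puts "#{page.url}: #{collector} -> #{found.inspect}"
end

p results[:link]   # => { <page url> => { internal: [...], external: [...] }, ... }
```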
data/lib/socializer/scraper/validator.rb ADDED
@@ -0,0 +1,15 @@
+ module Socializer
+   module Scraper
+     module Validator
+
+       def url_valid?
+
+       end
+
+       def mail_valid?
+
+       end
+
+     end
+   end
+ end
data/lib/socializer/scraper/version.rb ADDED
@@ -0,0 +1,5 @@
+ module Socializer
+   module Scraper
+     VERSION = "0.0.1"
+   end
+ end
data/lib/socializer/scraper.rb ADDED
@@ -0,0 +1,14 @@
+ require "uri"
+
+ require "anemone"
+
+ require "socializer/scraper/version"
+ require "socializer/scraper/extensions"
+ require "socializer/scraper/collector"
+ require "socializer/scraper/extractor"
+
+ module Socializer
+   module Scraper
+     # Your code goes here...
+   end
+ end
data/socializer-scraper.gemspec ADDED
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'socializer/scraper/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "socializer-scraper"
+   spec.version       = Socializer::Scraper::VERSION
+   spec.authors       = ["Nikhil Gupta"]
+   spec.email         = ["me@nikhgupta.com"]
+   spec.description   = %q{Various scrapers for the Socializer application.}
+   spec.summary       = %q{Various scrapers for the Socializer application.}
+   spec.homepage      = "http://nikhgupta.com"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rspec"
+   spec.add_development_dependency "yard"
+   spec.add_development_dependency "guard-yard"
+
+   spec.add_dependency "bson_ext"
+   spec.add_dependency "mongo"
+   spec.add_dependency "anemone"
+ end
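A quick way to sanity-check the gemspec above from a console, assuming it is run from the repository root so the relative path and the `git ls-files` call inside the spec resolve (a sketch, not part of the gem):

```ruby
require "rubygems"

# Load and evaluate the gemspec file shown above.
spec = Gem::Specification.load("socializer-scraper.gemspec")

puts spec.name      # => socializer-scraper
puts spec.version   # => 0.0.1
puts spec.runtime_dependencies.map(&:name).inspect
# => ["bson_ext", "mongo", "anemone"]
```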
data/spec/socializer/scraper_spec.rb ADDED
@@ -0,0 +1,11 @@
+ require 'spec_helper'
+
+ describe Socializer::Scraper do
+   it 'should have a version number' do
+     Socializer::Scraper::VERSION.should_not be_nil
+   end
+
+   it 'should do something useful' do
+     false.should be_true
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,2 @@
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+ require 'socializer/scraper'
metadata ADDED
@@ -0,0 +1,179 @@
+ --- !ruby/object:Gem::Specification
+ name: socializer-scraper
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Nikhil Gupta
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-04-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: guard-yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bson_ext
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: mongo
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: anemone
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Various scrapers for the Socializer application.
+ email:
+ - me@nikhgupta.com
+ executables:
+ - socializer-scraper
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - .rspec
+ - .travis.yml
+ - Gemfile
+ - Guardfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/socializer-scraper
+ - lib/socializer/scraper.rb
+ - lib/socializer/scraper/collector.rb
+ - lib/socializer/scraper/detector.rb
+ - lib/socializer/scraper/extensions.rb
+ - lib/socializer/scraper/extractor.rb
+ - lib/socializer/scraper/validator.rb
+ - lib/socializer/scraper/version.rb
+ - socializer-scraper.gemspec
+ - spec/socializer/scraper_spec.rb
+ - spec/spec_helper.rb
+ homepage: http://nikhgupta.com
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Various scrapers for the Socializer application.
+ test_files:
+ - spec/socializer/scraper_spec.rb
+ - spec/spec_helper.rb
+ has_rdoc: