socializer-scraper 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/bin/socializer-scraper +55 -0
- data/lib/socializer/scraper/version.rb +1 -1
- data/socializer-scraper.gemspec +2 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dca36b51da99ac68ef2d7f6dc62fb48c46ee015b
|
4
|
+
data.tar.gz: 03e55384ed88979807a2bddc02acd02082b31d82
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e07091135c2fbdafe8626dea0145119522cc2262df87459fa95b526c89f91a78d046d2a868cd853a69f4c72b487aa89e07473346b15f2e0bfab907a94f4a92b2
|
7
|
+
data.tar.gz: fa6185bc3d4700f4b65317cbc94f328b7cd27d8312c5622d8d2a5f21212dede0a6a341fc86df699d2f4845b0043719e7659399595163d0f9a54825866e783ec1
|
data/.gitignore
CHANGED
data/bin/socializer-scraper
CHANGED
@@ -1,3 +1,58 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'yaml'
|
4
|
+
require 'thor'
|
5
|
+
require 'fileutils'
|
3
6
|
require 'socializer/scraper'
|
7
|
+
|
8
|
+
class Socializer::Scraper::CLI < Thor
|
9
|
+
|
10
|
+
desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
|
11
|
+
def emails(*urls)
|
12
|
+
extractor = Socializer::Scraper::Extractor.new collectors: [:email]
|
13
|
+
urls.each do |website|
|
14
|
+
|
15
|
+
puts "=" * 100
|
16
|
+
puts "Current Time is : #{Time.now.utc}"
|
17
|
+
puts "Scraping website: #{website}"
|
18
|
+
puts "=" * 100
|
19
|
+
|
20
|
+
file = File.join(Dir.pwd, "#{website}.yml")
|
21
|
+
counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
|
22
|
+
|
23
|
+
extractor.url = "http://" + website unless website.start_with?("http")
|
24
|
+
extractor.run do |page, collector, found|
|
25
|
+
found = found.map{ |email| email.strip }.accumulate - list
|
26
|
+
list |= found
|
27
|
+
|
28
|
+
found = found.count
|
29
|
+
found = "+" if found > 9
|
30
|
+
found = "." if found < 1
|
31
|
+
|
32
|
+
if counter % 100 == 99
|
33
|
+
File.open(file, "w") { |f| f.puts list.to_yaml }
|
34
|
+
puts found
|
35
|
+
else
|
36
|
+
print found
|
37
|
+
end
|
38
|
+
|
39
|
+
counter += 1
|
40
|
+
end
|
41
|
+
|
42
|
+
puts "=" * 100
|
43
|
+
puts "Finish Time is : #{Time.now.utc}"
|
44
|
+
puts "Emails Found : #{list.count}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
Socializer::Scraper::CLI.start ARGV
|
51
|
+
|
52
|
+
websites = %w[
|
53
|
+
www.thegearpage.net
|
54
|
+
www.hugeracksin.com
|
55
|
+
www.rig-talk.com
|
56
|
+
www.guitariste.com
|
57
|
+
www.tonequest.com
|
58
|
+
]
|
data/socializer-scraper.gemspec
CHANGED
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_development_dependency "yard"
|
25
25
|
spec.add_development_dependency "guard-yard"
|
26
26
|
|
27
|
-
spec.add_dependency "
|
27
|
+
spec.add_dependency "thor"
|
28
28
|
spec.add_dependency "mongo"
|
29
29
|
spec.add_dependency "anemone"
|
30
|
+
spec.add_dependency "bson_ext"
|
30
31
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: socializer-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikhil Gupta
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: thor
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - '>='
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - '>='
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: bson_ext
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
description: Various scrapers for the Socializer application.
|
126
140
|
email:
|
127
141
|
- me@nikhgupta.com
|