socializer-scraper 0.0.2 → 0.0.3
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/bin/socializer-scraper +55 -0
- data/lib/socializer/scraper/version.rb +1 -1
- data/socializer-scraper.gemspec +2 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dca36b51da99ac68ef2d7f6dc62fb48c46ee015b
|
4
|
+
data.tar.gz: 03e55384ed88979807a2bddc02acd02082b31d82
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e07091135c2fbdafe8626dea0145119522cc2262df87459fa95b526c89f91a78d046d2a868cd853a69f4c72b487aa89e07473346b15f2e0bfab907a94f4a92b2
|
7
|
+
data.tar.gz: fa6185bc3d4700f4b65317cbc94f328b7cd27d8312c5622d8d2a5f21212dede0a6a341fc86df699d2f4845b0043719e7659399595163d0f9a54825866e783ec1
|
data/.gitignore
CHANGED
data/bin/socializer-scraper
CHANGED
@@ -1,3 +1,58 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'yaml'
|
4
|
+
require 'thor'
|
5
|
+
require 'fileutils'
|
3
6
|
require 'socializer/scraper'
|
7
|
+
|
8
|
+
class Socializer::Scraper::CLI < Thor
|
9
|
+
|
10
|
+
desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
|
11
|
+
def emails(*urls)
|
12
|
+
extractor = Socializer::Scraper::Extractor.new collectors: [:email]
|
13
|
+
urls.each do |website|
|
14
|
+
|
15
|
+
puts "=" * 100
|
16
|
+
puts "Current Time is : #{Time.now.utc}"
|
17
|
+
puts "Scraping website: #{website}"
|
18
|
+
puts "=" * 100
|
19
|
+
|
20
|
+
file = File.join(Dir.pwd, "#{website}.yml")
|
21
|
+
counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
|
22
|
+
|
23
|
+
extractor.url = "http://" + website unless website.start_with?("http")
|
24
|
+
extractor.run do |page, collector, found|
|
25
|
+
found = found.map{ |email| email.strip }.accumulate - list
|
26
|
+
list |= found
|
27
|
+
|
28
|
+
found = found.count
|
29
|
+
found = "+" if found > 9
|
30
|
+
found = "." if found < 1
|
31
|
+
|
32
|
+
if counter % 100 == 99
|
33
|
+
File.open(file, "w") { |f| f.puts list.to_yaml }
|
34
|
+
puts found
|
35
|
+
else
|
36
|
+
print found
|
37
|
+
end
|
38
|
+
|
39
|
+
counter += 1
|
40
|
+
end
|
41
|
+
|
42
|
+
puts "=" * 100
|
43
|
+
puts "Finish Time is : #{Time.now.utc}"
|
44
|
+
puts "Emails Found : #{list.count}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
Socializer::Scraper::CLI.start ARGV
|
51
|
+
|
52
|
+
websites = %w[
|
53
|
+
www.thegearpage.net
|
54
|
+
www.hugeracksin.com
|
55
|
+
www.rig-talk.com
|
56
|
+
www.guitariste.com
|
57
|
+
www.tonequest.com
|
58
|
+
]
|
data/socializer-scraper.gemspec
CHANGED
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_development_dependency "yard"
|
25
25
|
spec.add_development_dependency "guard-yard"
|
26
26
|
|
27
|
-
spec.add_dependency "
|
27
|
+
spec.add_dependency "thor"
|
28
28
|
spec.add_dependency "mongo"
|
29
29
|
spec.add_dependency "anemone"
|
30
|
+
spec.add_dependency "bson_ext"
|
30
31
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: socializer-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikhil Gupta
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: thor
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - '>='
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - '>='
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: bson_ext
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
description: Various scrapers for the Socializer application.
|
126
140
|
email:
|
127
141
|
- me@nikhgupta.com
|