socializer-scraper 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bc110e79790596f048e8745e0d346317c0ef1c0a
4
- data.tar.gz: 4d084ed27c23b9218f808a8084ed6accae5462be
3
+ metadata.gz: dca36b51da99ac68ef2d7f6dc62fb48c46ee015b
4
+ data.tar.gz: 03e55384ed88979807a2bddc02acd02082b31d82
5
5
  SHA512:
6
- metadata.gz: df9d0eb4c18b6c1e4f0bff4abd2c93b0ebb02af8567cda2d14550734d466fdafd6b47e3848def697d73bd6b7a9b64eff306b8db2583f1fcd6251d55d3f6fe57a
7
- data.tar.gz: 258073993bb483525cbe1a2c39f521bac12f8d3a4be58f7d1d48d1b299696c1c5ae19a5982308b985bbc5358c990b5b9ad56ca97aa42d4c2ad5c47a74be2e3f9
6
+ metadata.gz: e07091135c2fbdafe8626dea0145119522cc2262df87459fa95b526c89f91a78d046d2a868cd853a69f4c72b487aa89e07473346b15f2e0bfab907a94f4a92b2
7
+ data.tar.gz: fa6185bc3d4700f4b65317cbc94f328b7cd27d8312c5622d8d2a5f21212dede0a6a341fc86df699d2f4845b0043719e7659399595163d0f9a54825866e783ec1
data/.gitignore CHANGED
@@ -18,3 +18,4 @@ tmp
18
18
  data/
19
19
  tags
20
20
  scripts/
21
+ *.yml
@@ -1,3 +1,58 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'yaml'
4
+ require 'thor'
5
+ require 'fileutils'
3
6
  require 'socializer/scraper'
7
+
8
# Thor-based command line interface for the scraper executable.
class Socializer::Scraper::CLI < Thor

  # Matches an explicit http:// or https:// prefix on a command-line argument.
  URL_SCHEME = %r{\Ahttps?://}i

  desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
  # Scrapes every given website for email addresses and stores them in a
  # "<website>.yml" file inside the current working directory.
  #
  # Emails already present in that file are loaded first, so only new
  # addresses are counted and appended. One progress marker is printed per
  # yielded page (see #progress_marker); the list is checkpointed to disk on
  # every 100th page and once more when the crawl of a website finishes.
  #
  # @param urls [Array<String>] hostnames or full http(s) URLs to scrape
  # @return [Array<String>] the urls that were given
  def emails(*urls)
    extractor = Socializer::Scraper::Extractor.new collectors: [:email]
    separator = "=" * 100

    urls.each do |website|
      puts separator
      puts "Current Time is : #{Time.now.utc}"
      puts "Scraping website: #{website}"
      puts separator

      file    = storage_path(website)
      list    = load_emails(file)
      counter = 0

      # Always assign the URL. Previously it was only set when the scheme was
      # missing, so a full URL argument was never handed to the extractor.
      extractor.url = normalized_url(website)
      extractor.run do |_page, _collector, found|
        # NOTE(review): `accumulate` is not a core Array method; presumably it
        # is provided elsewhere in this gem or by a dependency - confirm.
        fresh = found.map { |email| email.strip }.accumulate - list
        list |= fresh

        marker = progress_marker(fresh.count)
        if counter % 100 == 99
          # Checkpoint every 100 pages and end the current progress row.
          save_emails(file, list)
          puts marker
        else
          print marker
        end

        counter += 1
      end

      # Persist whatever was collected since the last 100-page checkpoint.
      save_emails(file, list)

      puts separator
      puts "Finish Time is : #{Time.now.utc}"
      puts "Emails Found : #{list.count}"
    end
  end

  private

  # Returns the URL to crawl, prefixing "http://" when no scheme was given.
  def normalized_url(website)
    website =~ URL_SCHEME ? website : "http://" + website
  end

  # Builds the YAML path for a website inside the working directory. The
  # scheme is dropped and path-hostile characters are replaced, so a full URL
  # argument still maps to a single writable file; bare hostnames keep the
  # same file name as before.
  def storage_path(website)
    name = website.sub(URL_SCHEME, "").chomp("/").gsub(%r{[/\\:?*"<>|]}, "_")
    File.join(Dir.pwd, "#{name}.yml")
  end

  # Loads previously stored emails; a missing or empty file yields [].
  def load_emails(file)
    return [] unless File.exist?(file)

    YAML.load_file(file) || []
  end

  # Writes the complete email list to the given file as YAML.
  def save_emails(file, list)
    File.open(file, "w") { |f| f.puts list.to_yaml }
  end

  # Maps the number of new emails on a page to its progress marker:
  # "." for none, "+" for more than nine, otherwise the digit itself.
  def progress_marker(count)
    return "+" if count > 9
    return "." if count < 1

    count.to_s
  end

end
49
+
50
# Hand the command-line arguments over to the Thor CLI for dispatch.
Socializer::Scraper::CLI.start(ARGV)

# Hostnames assigned after dispatch; nothing in the visible script reads this list.
websites = [
  "www.thegearpage.net",
  "www.hugeracksin.com",
  "www.rig-talk.com",
  "www.guitariste.com",
  "www.tonequest.com",
]
@@ -1,5 +1,5 @@
1
1
  module Socializer
2
2
  module Scraper
3
- VERSION = "0.0.2"
3
+ VERSION = "0.0.3"
4
4
  end
5
5
  end
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
24
24
  spec.add_development_dependency "yard"
25
25
  spec.add_development_dependency "guard-yard"
26
26
 
27
- spec.add_dependency "bson_ext"
27
+ spec.add_dependency "thor"
28
28
  spec.add_dependency "mongo"
29
29
  spec.add_dependency "anemone"
30
+ spec.add_dependency "bson_ext"
30
31
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: socializer-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikhil Gupta
@@ -81,7 +81,7 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: bson_ext
84
+ name: thor
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - '>='
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - '>='
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: bson_ext
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  description: Various scrapers for the Socializer application.
126
140
  email:
127
141
  - me@nikhgupta.com