socializer-scraper 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bc110e79790596f048e8745e0d346317c0ef1c0a
4
- data.tar.gz: 4d084ed27c23b9218f808a8084ed6accae5462be
3
+ metadata.gz: dca36b51da99ac68ef2d7f6dc62fb48c46ee015b
4
+ data.tar.gz: 03e55384ed88979807a2bddc02acd02082b31d82
5
5
  SHA512:
6
- metadata.gz: df9d0eb4c18b6c1e4f0bff4abd2c93b0ebb02af8567cda2d14550734d466fdafd6b47e3848def697d73bd6b7a9b64eff306b8db2583f1fcd6251d55d3f6fe57a
7
- data.tar.gz: 258073993bb483525cbe1a2c39f521bac12f8d3a4be58f7d1d48d1b299696c1c5ae19a5982308b985bbc5358c990b5b9ad56ca97aa42d4c2ad5c47a74be2e3f9
6
+ metadata.gz: e07091135c2fbdafe8626dea0145119522cc2262df87459fa95b526c89f91a78d046d2a868cd853a69f4c72b487aa89e07473346b15f2e0bfab907a94f4a92b2
7
+ data.tar.gz: fa6185bc3d4700f4b65317cbc94f328b7cd27d8312c5622d8d2a5f21212dede0a6a341fc86df699d2f4845b0043719e7659399595163d0f9a54825866e783ec1
data/.gitignore CHANGED
@@ -18,3 +18,4 @@ tmp
18
18
  data/
19
19
  tags
20
20
  scripts/
21
+ *.yml
@@ -1,3 +1,58 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'yaml'
4
+ require 'thor'
5
+ require 'fileutils'
3
6
  require 'socializer/scraper'
7
+
8
# Thor-based command-line interface for the scraper executable.
class Socializer::Scraper::CLI < Thor

  # Pages processed between intermediate saves of the collected list.
  SAVE_EVERY = 100

  # Crawls every given website with an email-collecting extractor and stores
  # the addresses as a YAML list in "<website>.yml" inside the current
  # working directory. Addresses already present in that file are loaded
  # first, so only new ones are counted and appended.
  #
  # One progress character is printed per block invocation of the extractor:
  # "." when nothing new was found, the digit 1-9 for that many new
  # addresses, and "+" for ten or more.
  #
  # @param urls [Array<String>] host names or full http(s) URLs to crawl
  # @return [void]
  desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
  def emails(*urls)
    extractor = Socializer::Scraper::Extractor.new collectors: [:email]
    urls.each do |website|

      puts "=" * 100
      puts "Current Time is : #{Time.now.utc}"
      puts "Scraping website: #{website}"
      puts "=" * 100

      file = storage_file(website)
      list = load_list(file)
      counter = 0

      # Always point the extractor at the current website; otherwise a URL
      # that already carries a scheme would leave the previous target active.
      extractor.url = website =~ %r{\Ahttps?://}i ? website : "http://#{website}"
      extractor.run do |_page, _collector, found|
        # NOTE(review): `accumulate` is not a core Array method; presumably it
        # is provided elsewhere in the gem or its dependencies - verify.
        fresh = found.map { |email| email.strip }.accumulate - list
        list |= fresh

        marker = progress_marker(fresh.count)

        # Persist periodically so a long crawl can be interrupted safely.
        if counter % SAVE_EVERY == SAVE_EVERY - 1
          save_list(file, list)
          puts marker
        else
          print marker
        end

        counter += 1
      end

      # Write the final state; without this the pages processed since the
      # last periodic save (or all pages of a small site) would be lost.
      save_list(file, list)

      puts "=" * 100
      puts "Finish Time is : #{Time.now.utc}"
      puts "Emails Found : #{list.count}"
    end
  end

  private

  # Builds the YAML path for a website; the scheme is dropped and characters
  # unsafe in a file name are replaced so full URLs do not yield nested paths.
  def storage_file(website)
    name = website.sub(%r{\Ahttps?://}i, "").gsub(/[^\w.-]+/, "_")
    File.join(Dir.pwd, "#{name}.yml")
  end

  # Loads the previously stored list, or an empty one when the file is
  # missing or empty (YAML.load_file returns a falsy value for empty files).
  def load_list(file)
    return [] unless File.exist?(file)
    YAML.load_file(file) || []
  end

  # Overwrites the given file with the list serialized as YAML.
  def save_list(file, list)
    File.open(file, "w") { |f| f.puts list.to_yaml }
  end

  # Maps a count of newly found addresses to a single progress character.
  def progress_marker(count)
    return "." if count < 1
    return "+" if count > 9
    count.to_s
  end

end
49
+
50
# Entry point: let Thor parse the command-line arguments and run the command.
Socializer::Scraper::CLI.start(ARGV)

# Forum host names; assigned after dispatch and not referenced again in the
# visible script.
websites = %w[www.thegearpage.net www.hugeracksin.com www.rig-talk.com www.guitariste.com www.tonequest.com]
@@ -1,5 +1,5 @@
1
1
  module Socializer
2
2
  module Scraper
3
- VERSION = "0.0.2"
3
+ VERSION = "0.0.3"
4
4
  end
5
5
  end
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
24
24
  spec.add_development_dependency "yard"
25
25
  spec.add_development_dependency "guard-yard"
26
26
 
27
- spec.add_dependency "bson_ext"
27
+ spec.add_dependency "thor"
28
28
  spec.add_dependency "mongo"
29
29
  spec.add_dependency "anemone"
30
+ spec.add_dependency "bson_ext"
30
31
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: socializer-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikhil Gupta
@@ -81,7 +81,7 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: bson_ext
84
+ name: thor
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - '>='
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - '>='
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: bson_ext
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  description: Various scrapers for the Socializer application.
126
140
  email:
127
141
  - me@nikhgupta.com