socializer-scraper 0.1.0 → 0.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b696c3456d065cd7a801ade53292c16484e63044
-  data.tar.gz: 2547800d0bd1bcbe014e5d093e714e110904a1f3
+  metadata.gz: b7ab615984c9d9cc1c63b1f3525677f3cea346ab
+  data.tar.gz: 301234730dc644a79f14cde68630c581d7f11212
 SHA512:
-  metadata.gz: a8b80c6e98fe9389318e35d55c2265b7979fa5173c122b947630b64e5aa12c7bed514974f353391873c289ac5c8726592ecc4d527e689af5fe00c662be54c90b
-  data.tar.gz: 633af1996f78a6fe2c5c0be2f7d3007966342be622806e1b2cc8076f666a4f18b377db98d7fb706c3c50e2290a8b49eb0f0e5fa7e787d6936edd855fd110d4c7
+  metadata.gz: 2b9b2249243883434e2958fc8e2e42275359893955cea4badccfc2d56ed9f23023d492d02193968a59968b3fb7280d89f7bdf6d692cf6c551c1ce124943bd663
+  data.tar.gz: 64236e958b29d358a15103343c19aab8859e7cc80220c1fd69feaed5c7fd9589047d9075a32d17c3a94b90ba2ef8ad2ca8d578e30bd16cd947e776b732ca297b
@@ -10,6 +10,7 @@ STDOUT.sync = true
 class Socializer::Scraper::CLI < Thor
 
   desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
+  method_options pattern: :string, desc: "Comma-separated list of patterns that select which links to follow"
   def emails(*urls)
     extractor = Socializer::Scraper::Extractor.new collectors: [:email]
     urls.each do |website|
@@ -19,13 +20,15 @@ class Socializer::Scraper::CLI < Thor
       puts "Scraping website: #{website}"
       puts "=" * 100
 
-      website = URI.parse("http://#{website}") unless website.start_with?("http")
+      website = URI.parse(website.start_with?("http") ? website : "http://#{website}")
       file = File.join(Dir.pwd, "#{website.host}.yml")
       counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
 
+      patterns = options.has_key?("pattern") ? options["pattern"].split(",").map{|a| Regexp.new a} : []
+
       extractor.url = website.to_s
-      extractor.run do |page, collector, found|
-        found = found.map{ |email| email.strip }.accumulate - list
+      extractor.run(*patterns) do |page, collector, found|
+        found = found.map{ |email| email.strip.downcase }.accumulate - list
         list |= found
 
         found = found.count
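
The new --pattern option flows straight into the crawl: the comma-separated value is split into one Regexp per entry and splatted into Extractor#run, and scraped emails are now lowercased before deduplication, so addresses differing only in case no longer count twice. A minimal sketch of the option parsing above (the option value here is a made-up example):

    pattern_option = "forums?,/threads/"  # hypothetical --pattern value
    patterns = pattern_option.split(",").map { |a| Regexp.new(a) }

    patterns.any? { |p| "http://example.com/forums/12" =~ p }  # => true
    patterns.any? { |p| "http://example.com/about" =~ p }      # => false
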
@@ -51,11 +54,3 @@ class Socializer::Scraper::CLI < Thor
 end
 
 Socializer::Scraper::CLI.start ARGV
-
-websites = %w[
-  www.thegearpage.net
-  www.hugeracksin.com
-  www.rig-talk.com
-  www.guitariste.com
-  www.tonequest.com
-]
@@ -79,13 +79,26 @@ module Socializer
       patterns.push(/.*/) if patterns.empty?
 
       Anemone.crawl(@url, options) do |anemone|
+        anemone.threads = 2
+        anemone.verbose = true
+        anemone.obey_robots_txt = true
+        anemone.accept_cookies = true
+        anemone.user_agent = "Googlebot"
         anemone.storage = Anemone::Storage.MongoDB
-        anemone.on_pages_like(*patterns) do |page|
+        anemone.focus_crawl{|page| links_matching(page.links, patterns) }
+        anemone.on_every_page do |page|
           @page, @html, @current_url = page, nil, page.url
           yield(page)
         end
       end
     end
+
+    def links_matching links, patterns = []
+      return links if patterns.empty?
+      links.select do |link|
+        patterns.detect{|p| link.to_s =~ p}
+      end
+    end
   end
 end
 end
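
Swapping on_pages_like for focus_crawl changes where the patterns apply: on_pages_like only filtered which already-fetched pages reached the handler, whereas focus_crawl filters which links Anemone enqueues at all, so non-matching branches are never downloaded. The new links_matching helper is easy to exercise on its own; a minimal sketch with made-up URLs and patterns:

    require "uri"

    links    = [URI("http://example.com/forum/1"), URI("http://example.com/contact")]
    patterns = [/forum/]

    # same select/detect logic as links_matching above
    links.select { |link| patterns.detect { |p| link.to_s =~ p } }
    # => [#<URI::HTTP http://example.com/forum/1>]
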
@@ -1,5 +1,5 @@
 module Socializer
   module Scraper
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
   end
 end
@@ -12,3 +12,57 @@ module Socializer
     # Your code goes here...
   end
 end
+
+module Anemone
+  class Core
+    #
+    # Perform the crawl
+    #
+    def run
+      process_options
+
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+      end
+
+      @urls.each{ |url| link_queue.enq(url) }
+
+      loop do
+        page = page_queue.deq
+        @pages.touch_key page.url
+        print "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        do_page_blocks page
+        page.discard_doc! if @opts[:discard_page_bodies]
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+          if page_queue.empty?
+            @tentacles.size.times { link_queue << :END }
+            break
+          end
+        end
+      end
+
+      @tentacles.each { |thread| thread.join }
+      do_after_crawl_blocks
+      self
+    end
+  end
+end
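
The reopened Anemone::Core#run is a producer/consumer pipeline: Tentacle worker threads pull [url, referer, depth] entries off link_queue and push fetched pages onto page_queue, while the main loop drains page_queue, runs the page blocks, and enqueues follow-up links. The shutdown handshake, both queues empty and every worker parked on link_queue before the :END sentinels go out, is the subtle part. A self-contained sketch of that pattern under illustrative names (not Anemone's API):

    require "thread"

    work, done = Queue.new, Queue.new
    workers = 2.times.map do
      Thread.new do
        while (job = work.deq) != :END
          done << job * 2  # stand-in for fetching a page
        end
      end
    end

    [1, 2, 3].each { |n| work << n }

    results = []
    loop do
      results << done.deq
      if work.empty? && done.empty?
        # quiescent only once every worker is blocked on work.deq
        Thread.pass until work.num_waiting == workers.size
        if done.empty?
          workers.size.times { work << :END }  # poison pills
          break
        end
      end
    end
    workers.each(&:join)
    p results.sort  # => [2, 4, 6]
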
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: socializer-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Nikhil Gupta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-04 00:00:00.000000000 Z
+date: 2014-04-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler