socializer-scraper 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b696c3456d065cd7a801ade53292c16484e63044
-  data.tar.gz: 2547800d0bd1bcbe014e5d093e714e110904a1f3
+  metadata.gz: b7ab615984c9d9cc1c63b1f3525677f3cea346ab
+  data.tar.gz: 301234730dc644a79f14cde68630c581d7f11212
 SHA512:
-  metadata.gz: a8b80c6e98fe9389318e35d55c2265b7979fa5173c122b947630b64e5aa12c7bed514974f353391873c289ac5c8726592ecc4d527e689af5fe00c662be54c90b
-  data.tar.gz: 633af1996f78a6fe2c5c0be2f7d3007966342be622806e1b2cc8076f666a4f18b377db98d7fb706c3c50e2290a8b49eb0f0e5fa7e787d6936edd855fd110d4c7
+  metadata.gz: 2b9b2249243883434e2958fc8e2e42275359893955cea4badccfc2d56ed9f23023d492d02193968a59968b3fb7280d89f7bdf6d692cf6c551c1ce124943bd663
+  data.tar.gz: 64236e958b29d358a15103343c19aab8859e7cc80220c1fd69feaed5c7fd9589047d9075a32d17c3a94b90ba2ef8ad2ca8d578e30bd16cd947e776b732ca297b
@@ -10,6 +10,7 @@ STDOUT.sync = true
 class Socializer::Scraper::CLI < Thor
 
   desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
+  method_options pattern: :string, desc: "Comma separate list of patterns that selects which links to follow"
   def emails(*urls)
     extractor = Socializer::Scraper::Extractor.new collectors: [:email]
     urls.each do |website|
@@ -19,13 +20,15 @@ class Socializer::Scraper::CLI < Thor
       puts "Scraping website: #{website}"
       puts "=" * 100
 
-      website = URI.parse("http://#{website}") unless website.start_with?("http")
+      website = URI.parse(website.start_with?("http") ? website : "http://#{website}")
       file = File.join(Dir.pwd, "#{website.host}.yml")
       counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
 
+      patterns = options.has_key?("pattern") ? options["pattern"].split(",").map{|a| Regexp.new a} : []
+
       extractor.url = website.to_s
-      extractor.run do |page, collector, found|
-        found = found.map{ |email| email.strip }.accumulate - list
+      extractor.run(*patterns) do |page, collector, found|
+        found = found.map{ |email| email.strip.downcase }.accumulate - list
        list |= found
 
         found = found.count
@@ -51,11 +54,3 @@ class Socializer::Scraper::CLI < Thor
 end
 
 Socializer::Scraper::CLI.start ARGV
-
-websites = %w[
-  www.thegearpage.net
-  www.hugeracksin.com
-  www.rig-talk.com
-  www.guitariste.com
-  www.tonequest.com
-]
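
The hunks above add an optional --pattern flag to the Thor `emails` task: its comma-separated value is compiled into regular expressions and splatted into Extractor#run so that only matching links are followed, and the hard-coded websites list is dropped. A minimal sketch of that pattern handling, with a hypothetical option value (the installed executable name is not shown in this diff):

# Sketch only: mirrors the pattern handling added to the emails task.
# "forum,thread" stands in for a value passed as --pattern "forum,thread".
options  = { "pattern" => "forum,thread" }
patterns = options.has_key?("pattern") ? options["pattern"].split(",").map { |a| Regexp.new(a) } : []
# => [/forum/, /thread/]
# These regexps are handed to Extractor#run, which forwards them to the
# links_matching helper shown in the extractor hunk below.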
@@ -79,13 +79,26 @@ module Socializer
        patterns.push(/.*/) if patterns.empty?
 
        Anemone.crawl(@url, options) do |anemone|
+          anemone.threads = 2
+          anemone.verbose = true
+          anemone.obey_robots_txt = true
+          anemone.accept_cookies = true
+          anemone.user_agent = "Googlebot"
          anemone.storage = Anemone::Storage.MongoDB
-          anemone.on_pages_like(*patterns) do |page|
+          anemone.focus_crawl{|page| links_matching(page.links, patterns) }
+          anemone.on_every_page do |page|
            @page, @html, @current_url = page, nil, page.url
            yield(page)
          end
        end
      end
+
+      def links_matching links, patterns = []
+        return links if patterns.empty?
+        links.select do |link|
+          patterns.detect{|p| link.to_s =~ p}
+        end
+      end
    end
  end
end
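
In the extractor, the crawl is now steered through Anemone's focus_crawl hook rather than on_pages_like: every fetched page is still yielded to the caller, while the new links_matching helper decides which outgoing links get queued. A small illustration of that filtering, using placeholder URLs:

require "uri"

links    = [URI("http://example.com/forum/topic-1"), URI("http://example.com/contact")]
patterns = [/forum/]

# Same selection links_matching performs: keep links matching at least one
# pattern; with an empty pattern list it returns every link unchanged.
links.select { |link| patterns.detect { |p| link.to_s =~ p } }
# => [#<URI::HTTP http://example.com/forum/topic-1>]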
@@ -1,5 +1,5 @@
 module Socializer
   module Scraper
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
   end
 end
@@ -12,3 +12,57 @@ module Socializer
    # Your code goes here...
  end
end
+
+module Anemone
+  class Core
+    #
+    # Perform the crawl
+    #
+    def run
+      process_options
+
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+      end
+
+      @urls.each{ |url| link_queue.enq(url) }
+
+      loop do
+        page = page_queue.deq
+        @pages.touch_key page.url
+        print "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        do_page_blocks page
+        page.discard_doc! if @opts[:discard_page_bodies]
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+          if page_queue.empty?
+            @tentacles.size.times { link_queue << :END }
+            break
+          end
+        end
+      end
+
+      @tentacles.each { |thread| thread.join }
+      do_after_crawl_blocks
+      self
+    end
+  end
+end
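
The block appended here reopens Anemone::Core and redefines #run, keeping Anemone's queue-and-tentacle crawl loop but printing each crawled URL together with the pending link-queue size when :verbose is set. Since the extractor now turns on verbose mode, two threads, robots.txt compliance, cookies, and a Googlebot user agent, a standalone crawl with roughly equivalent options might look like this sketch (the URL is a placeholder; options can also be set inside the block, as the extractor does):

require "anemone"

Anemone.crawl("http://example.com",
              threads: 2, verbose: true, obey_robots_txt: true,
              accept_cookies: true, user_agent: "Googlebot") do |anemone|
  # With the patched #run, each page is reported as "<url> Queue: <n>"
  # while the crawl progresses.
  anemone.on_every_page { |page| puts page.code }
end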
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: socializer-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Nikhil Gupta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-04 00:00:00.000000000 Z
+date: 2014-04-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler