socializer-scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/socializer-scraper +6 -11
- data/lib/socializer/scraper/extractor.rb +14 -1
- data/lib/socializer/scraper/version.rb +1 -1
- data/lib/socializer/scraper.rb +54 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b7ab615984c9d9cc1c63b1f3525677f3cea346ab
+  data.tar.gz: 301234730dc644a79f14cde68630c581d7f11212
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2b9b2249243883434e2958fc8e2e42275359893955cea4badccfc2d56ed9f23023d492d02193968a59968b3fb7280d89f7bdf6d692cf6c551c1ce124943bd663
+  data.tar.gz: 64236e958b29d358a15103343c19aab8859e7cc80220c1fd69feaed5c7fd9589047d9075a32d17c3a94b90ba2ef8ad2ca8d578e30bd16cd947e776b732ca297b
data/bin/socializer-scraper
CHANGED
@@ -10,6 +10,7 @@ STDOUT.sync = true
 class Socializer::Scraper::CLI < Thor
 
   desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
+  method_options pattern: :string, desc: "Comma separate list of patterns that selects which links to follow"
   def emails(*urls)
     extractor = Socializer::Scraper::Extractor.new collectors: [:email]
     urls.each do |website|
@@ -19,13 +20,15 @@ class Socializer::Scraper::CLI < Thor
       puts "Scraping website: #{website}"
       puts "=" * 100
 
-      website = URI.parse("http://#{website}")
+      website = URI.parse(website.start_with?("http") ? website : "http://#{website}")
       file = File.join(Dir.pwd, "#{website.host}.yml")
       counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
 
+      patterns = options.has_key?("pattern") ? options["pattern"].split(",").map{|a| Regexp.new a} : []
+
       extractor.url = website.to_s
-      extractor.run do |page, collector, found|
-        found = found.map{ |email| email.strip }.accumulate - list
+      extractor.run(*patterns) do |page, collector, found|
+        found = found.map{ |email| email.strip.downcase }.accumulate - list
         list |= found
 
         found = found.count
@@ -51,11 +54,3 @@ class Socializer::Scraper::CLI < Thor
 end
 
 Socializer::Scraper::CLI.start ARGV
-
-websites = %w[
-  www.thegearpage.net
-  www.hugeracksin.com
-  www.rig-talk.com
-  www.guitariste.com
-  www.tonequest.com
-]
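The new pattern option narrows which links the crawler follows. A minimal sketch of how the flag's value is turned into regular expressions, mirroring the logic above (the value "forum,thread" is illustrative):

    # Thor exposes the flag as options["pattern"], e.g. "forum,thread"
    patterns = "forum,thread".split(",").map { |a| Regexp.new(a) }
    patterns                                        # => [/forum/, /thread/]
    "http://example.com/forum/42" =~ patterns.first # => 19 (truthy, so the link is followed)

From the shell this would look like socializer-scraper emails example.com --pattern forum,thread; without the flag, patterns stays empty and every link is followed.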
data/lib/socializer/scraper/extractor.rb
CHANGED
@@ -79,13 +79,26 @@ module Socializer
         patterns.push(/.*/) if patterns.empty?
 
         Anemone.crawl(@url, options) do |anemone|
+          anemone.threads = 2
+          anemone.verbose = true
+          anemone.obey_robots_txt = true
+          anemone.accept_cookies = true
+          anemone.user_agent = "Googlebot"
           anemone.storage = Anemone::Storage.MongoDB
-          anemone.
+          anemone.focus_crawl{|page| links_matching(page.links, patterns) }
+          anemone.on_every_page do |page|
             @page, @html, @current_url = page, nil, page.url
             yield(page)
           end
         end
       end
+
+      def links_matching links, patterns = []
+        return links if patterns.empty?
+        links.select do |link|
+          patterns.detect{|p| link.to_s =~ p}
+        end
+      end
     end
   end
 end
data/lib/socializer/scraper/version.rb
CHANGED
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
data/lib/socializer/scraper.rb
CHANGED
@@ -12,3 +12,57 @@ module Socializer
     # Your code goes here...
   end
 end
+
+module Anemone
+  class Core
+    #
+    # Perform the crawl
+    #
+    def run
+      process_options
+
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+      end
+
+      @urls.each{ |url| link_queue.enq(url) }
+
+      loop do
+        page = page_queue.deq
+        @pages.touch_key page.url
+        print "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        do_page_blocks page
+        page.discard_doc! if @opts[:discard_page_bodies]
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+          if page_queue.empty?
+            @tentacles.size.times { link_queue << :END }
+            break
+          end
+        end
+      end
+
+      @tentacles.each { |thread| thread.join }
+      do_after_crawl_blocks
+      self
+    end
+  end
+end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: socializer-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Nikhil Gupta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-
+date: 2014-04-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler