socializer-scraper 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e3be782e7e3dd71ba2ec0b6c8cd7ffb35c65305f
+   data.tar.gz: f548cdcfe90f8e3370a9d3148c759fd623765a89
+ SHA512:
+   metadata.gz: 9a37596beb23d40ab660d9f5becdf875033f9344fbebfb781d4ced69dd5e98baf274a72764c26f783a3a9b2451f0e3c3422a54e247f698ad225f24778010c52a
+   data.tar.gz: 0d9a89741130f94df7fb993a84de6d1aba5d2d7ab290954dc5b9d4141940f778dbe88a260521b86896208a96b1ca54ad33d8865f38c74269462d231ee8d92c9a
data/.gitignore ADDED
@@ -0,0 +1,20 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ data/
+ tags
+ scripts/
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
+ language: ruby
+ rvm:
+   - 2.0.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in socializer-scraper.gemspec
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,8 @@
+ # A sample Guardfile
+ # More info at https://github.com/guard/guard#readme
+
+ guard 'yard' do
+   watch(%r{app/.+\.rb})
+   watch(%r{lib/.+\.rb})
+   watch(%r{ext/.+\.c})
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Nikhil Gupta
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Socializer::Scraper
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'socializer-scraper'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install socializer-scraper
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/socializer-scraper ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env ruby
+
+ require 'socializer/scraper'
data/lib/socializer/scraper/collector.rb ADDED
@@ -0,0 +1,84 @@
+ module Socializer
+   module Scraper
+     module Collector
+       def email_collector
+
+         at_re = "(@|\\s*[\\[|\\(|\\{](@|at)[\\]|\\)|\\}]\\s*)"
+         dt_re = "(\\.|\\s*[\\[|\\(|\\{](\\.|dot)[\\]|\\)|\\}]\\s*)"
+         regex = /([A-Z0-9._%-]+#{at_re}([A-Z0-9-]+#{dt_re})+[A-Z]{2,4})/i
+
+         emails = @page.body.scan(regex).map do |a|
+           "mailto:" + a[0].gsub(a[1], "@").gsub(a[4], ".")
+         end rescue []
+
+         (emails | page_links).map do |e|
+           uri = URI.parse(URI.encode(e))
+           uri.to_s if uri.mail?
+         end.compact
+       end
+
+       def sitemap_collector
+         @current_url
+       end
+
+       def link_collector
+         page_links.map do |link|
+           begin
+             uri = URI.parse(link).absolute(@url.host, @url.scheme)
+
+             case
+             when uri.url? && uri.host == @url.host then { internal: uri.to_s }
+             when uri.url? then { external: link }
+             when uri.scheme then { uri.scheme.to_sym => link }
+             else { unknown: link }
+             end
+           rescue URI::InvalidURIError
+             { unknown: link }
+           end
+         end.collect_as_hash
+       end
+
+       def live_link_collector
+         page_links.map do |link|
+           begin
+             uri = URI.parse(link).absolute(@url.host, @url.scheme)
+
+             case
+             when uri.respond_to?(:error?) && (error = uri.error?)
+               then { error => uri.to_s }
+             when uri.url? && uri.host == @url.host
+               then { internal: uri.to_s }
+             when uri.url? then { external: link }
+             when uri.scheme then { uri.scheme => link }
+             else { unknown: link }
+             end
+           rescue URI::InvalidURIError
+             { unknown: link }
+           end
+         end.collect_as_hash
+       end
+
+       def social_profile_collector options = {}
+         default  = [ :facebook, :twitter, :github ]
+         required = options.select{ |k, v| v }.keys
+         allowed  = if options.empty?
+           default
+         elsif required.any?
+           required - (required - default)
+         else
+           default - options.keys
+         end
+
+         allowed = allowed.map{ |a| { a => [] } }.collect_as_hash
+
+         allowed.hash_map do |provider|
+           regex = /#{provider}\.com\/[^\/]*$/
+           links = page_links.map do |link|
+             link =~ regex ? link : nil
+           end.accumulate
+           [provider, links]
+         end
+       end
+     end
+   end
+ end
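The collectors above return either flat arrays (`email_collector`, `sitemap_collector`) or arrays of one-key hashes that `Array#collect_as_hash` (defined in `extensions.rb` below) merges by key. A minimal sketch of that grouping step, with made-up URLs; the `each_with_object` here only mirrors what `collect_as_hash` does:

```ruby
# Illustration only: the per-link hashes that link_collector emits before
# they are merged. All URLs are invented for the example.
per_link = [
  { internal: "http://example.com/about" },
  { external: "http://twitter.com/someone" },
  { internal: "http://example.com/contact" },
  { mailto:   "mailto:me@example.com" }
]

# collect_as_hash groups one-key hashes by key and de-duplicates the values,
# roughly equivalent to:
grouped = per_link.each_with_object(Hash.new { |h, k| h[k] = [] }) do |entry, acc|
  entry.each { |kind, link| acc[kind] |= [link] }
end

p grouped
# => {:internal=>["http://example.com/about", "http://example.com/contact"],
#     :external=>["http://twitter.com/someone"],
#     :mailto=>["mailto:me@example.com"]}
```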
data/lib/socializer/scraper/detector.rb ADDED
@@ -0,0 +1,15 @@
+ module Socializer
+   module Scraper
+     module Detector
+
+       def similar_pages
+
+       end
+
+       def ssl_page_exists?
+
+       end
+
+     end
+   end
+ end
data/lib/socializer/scraper/extensions.rb ADDED
@@ -0,0 +1,109 @@
+ class Array
+
+   def hash_collection?
+     flatten.compact.reject{|v| v.is_a?(Hash)}.empty?
+   end
+
+   def collect_as_hash
+     raise StandardError, "Array is not a hash collection!" unless hash_collection?
+     flatten.compact.each_with_object(Hash.new([])) do |h1,h|
+       h1.each{|k,v| h[k] = (h[k] | [v]).accumulate }
+     end
+   end
+
+   def accumulate
+     flatten.compact.uniq
+   end
+
+   def hashify_or_collect
+     hash_collection? ? collect_as_hash : accumulate
+   end
+
+   def extract_options!
+     last.is_a?(Hash) && last.instance_of?(Hash) ? pop : {}
+   end
+ end
+
+ class Hash
+   def hash_map &block
+     Hash[self.map{|key, value| yield(key, value) }]
+   end
+
+   def hash_collection?
+     true
+   end
+
+   def collect_as_hash
+     self
+   end
+   alias :hashify_or_collect :collect_as_hash
+ end
+
+ class String
+   def url?
+     self =~ /^#{URI::regexp}$/
+   end
+
+   def blank?
+     strip.empty?
+   end
+ end
+
+ module URI
+
+   class Generic
+     def url?
+       %w[ http https ].include?(scheme)
+     end
+
+     def mail?
+       scheme == "mailto"
+     end
+
+     def absolute(host, scheme = nil)
+       return self unless self.scheme.nil?
+       path = to_s.start_with?("/") ? to_s : "/#{to_s}"
+       URI.parse("#{scheme.blank? ? "http" : scheme}://#{host}#{path}")
+     end
+
+   end
+
+   class HTTP
+     def error?
+       return :unknown unless url?
+       puts "Testing URL: #{self}"
+       req = Net::HTTP.new(host, port)
+       req.use_ssl = is_a?(URI::HTTPS)
+       res = req.request_head(path.empty? ? "/" : path)
+       if res.kind_of?(Net::HTTPRedirection)
+         URI.parse(res["location"]).absolute(host, scheme).error?
+       else
+         case
+         when res.code == "401" || res.code == "407" then :unauthorized
+         when res.code == "403" then :forbidden
+         when res.code == "404" then :not_found
+         when res.code[0] == "4" then :client_error
+         when res.code == "503" then :temporary_server_error
+         when res.code[0] == "5" then :server_error
+         end
+       end
+     rescue ::Errno::ENOENT, ::SocketError
+       :no_such_server
+     end
+   end
+ end
+
+ class Object
+   def accumulate
+     [ self ].accumulate
+   end
+
+   def blank?
+     obj = respond_to?(:strip) ? strip : self
+     obj.respond_to?(:empty?) ? obj.empty? : !obj
+   end
+
+   def present?
+     !blank?
+   end
+ end
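Under the monkey patches above, the Array helpers behave roughly as follows (a sketch of expected results, assuming the gem is installed and loaded through its top-level require):

```ruby
require "socializer/scraper"  # loads extensions.rb along with the rest of the gem

[[1, nil, 2], [2, 3]].accumulate
# => [1, 2, 3]                       # flatten + compact + uniq

[{ a: 1 }, { a: 2 }, { b: 3 }].hash_collection?
# => true                            # every element is a Hash

[{ a: 1 }, { a: 2 }, { b: 3 }].collect_as_hash
# => { a: [1, 2], b: [3] }           # values grouped per key and accumulated
```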
data/lib/socializer/scraper/extractor.rb ADDED
@@ -0,0 +1,89 @@
+ module Socializer
+   module Scraper
+     class Extractor
+
+       include Socializer::Scraper::Collector
+
+       attr_reader :url
+       attr_writer :collectors
+
+       def initialize options = {}
+         self.url        = options.fetch(:url, nil)
+         self.collectors = options.fetch(:collectors, [])
+       end
+
+       # Set the URL to crawl for this Crawler instance.
+       #
+       # @param url [String] URL or domain name to crawl.
+       # @return [String] url
+       def url= url
+         return unless url
+         @url = URI.parse(url)
+         message = "Please, provide a URL that starts with HTTP or HTTPS"
+         raise URI::InvalidURIError, message unless @url.url?
+       end
+
+       def collectors
+         @collectors.any? ? @collectors : self.class.available_collectors
+       end
+
+       def run *patterns, &block
+         data, options = {}, patterns.extract_options!
+         page_wise = options.fetch(:page_wise, false)
+
+         perform(*patterns) do |page|
+           collectors.each do |collector|
+             found = send("#{collector}_collector")
+             yield(page, collector, found) if block_given?
+             if page_wise
+               data[collector] ||= {}
+               data[collector][@current_url] = found
+             else
+               data[collector] ||= []
+               data[collector].push found
+             end
+           end
+         end
+
+         data.hash_map{|kind, list| [kind, list.hashify_or_collect]}
+       end
+
+       class << self
+         def available_collectors
+           self.instance_methods.select do |name|
+             name.to_s.end_with?("_collector")
+           end.map do |name|
+             name.to_s.gsub(/_collector$/, '').to_sym
+           end
+         end
+       end
+
+       protected
+
+       def page_html
+         @html ||= Nokogiri::HTML(@page.body)
+       end
+
+       def page_links
+         page_html.search("a").map{|a| a.attr("href")}.accumulate
+       end
+
+       private
+
+       def perform *patterns, &block
+         message = "Please, provide a URL that starts with HTTP or HTTPS"
+         raise URI::InvalidURIError, message unless @url.url?
+
+         patterns.push(/.*/) if patterns.empty?
+
+         Anemone.crawl(@url) do |anemone|
+           anemone.storage = Anemone::Storage.MongoDB
+           anemone.on_pages_like(*patterns) do |page|
+             @page, @html, @current_url = page, nil, page.url
+             yield(page)
+           end
+         end
+       end
+     end
+   end
+ end
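For context, a hedged usage sketch of the Extractor API defined above. The URL is hypothetical, and an actual crawl needs network access plus a reachable MongoDB instance, since `perform` wires Anemone to `Anemone::Storage.MongoDB`:

```ruby
require "socializer/scraper"

# :email and :link are two of the collectors mixed in from Collector;
# passing none falls back to Extractor.available_collectors.
extractor = Socializer::Scraper::Extractor.new(
  url:        "http://example.com",   # hypothetical target
  collectors: [:email, :link]
)

# page_wise: true groups results per crawled page; the block is optional and
# fires once per (page, collector) pair while the crawl runs.
results = extractor.run(page_wise: true) do |page, collector, found|
  puts "#{page.url}: #{collector} -> #{found.inspect}"
end

p results[:link]   # => { <page url> => { internal: [...], external: [...] }, ... }
```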
data/lib/socializer/scraper/validator.rb ADDED
@@ -0,0 +1,15 @@
+ module Socializer
+   module Scraper
+     module Validator
+
+       def url_valid?
+
+       end
+
+       def mail_valid?
+
+       end
+
+     end
+   end
+ end
data/lib/socializer/scraper/version.rb ADDED
@@ -0,0 +1,5 @@
+ module Socializer
+   module Scraper
+     VERSION = "0.0.1"
+   end
+ end
data/lib/socializer/scraper.rb ADDED
@@ -0,0 +1,14 @@
+ require "uri"
+
+ require "anemone"
+
+ require "socializer/scraper/version"
+ require "socializer/scraper/extensions"
+ require "socializer/scraper/collector"
+ require "socializer/scraper/extractor"
+
+ module Socializer
+   module Scraper
+     # Your code goes here...
+   end
+ end
data/socializer-scraper.gemspec ADDED
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'socializer/scraper/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "socializer-scraper"
+   spec.version       = Socializer::Scraper::VERSION
+   spec.authors       = ["Nikhil Gupta"]
+   spec.email         = ["me@nikhgupta.com"]
+   spec.description   = %q{Various scrapers for the Socializer application.}
+   spec.summary       = %q{Various scrapers for the Socializer application.}
+   spec.homepage      = "http://nikhgupta.com"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rspec"
+   spec.add_development_dependency "yard"
+   spec.add_development_dependency "guard-yard"
+
+   spec.add_dependency "bson_ext"
+   spec.add_dependency "mongo"
+   spec.add_dependency "anemone"
+ end
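A quick way to sanity-check the gemspec above from a console, assuming it is run from the repository root so the relative path and the `git ls-files` call inside the spec resolve (a sketch, not part of the gem):

```ruby
require "rubygems"

# Load and evaluate the gemspec file shown above.
spec = Gem::Specification.load("socializer-scraper.gemspec")

puts spec.name      # => socializer-scraper
puts spec.version   # => 0.0.1
puts spec.runtime_dependencies.map(&:name).inspect
# => ["bson_ext", "mongo", "anemone"]
```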
data/spec/socializer/scraper_spec.rb ADDED
@@ -0,0 +1,11 @@
+ require 'spec_helper'
+
+ describe Socializer::Scraper do
+   it 'should have a version number' do
+     Socializer::Scraper::VERSION.should_not be_nil
+   end
+
+   it 'should do something useful' do
+     false.should be_true
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,2 @@
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+ require 'socializer/scraper'
metadata ADDED
@@ -0,0 +1,179 @@
+ --- !ruby/object:Gem::Specification
+ name: socializer-scraper
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Nikhil Gupta
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-04-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: guard-yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bson_ext
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: mongo
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: anemone
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Various scrapers for the Socializer application.
+ email:
+ - me@nikhgupta.com
+ executables:
+ - socializer-scraper
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - .rspec
+ - .travis.yml
+ - Gemfile
+ - Guardfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/socializer-scraper
+ - lib/socializer/scraper.rb
+ - lib/socializer/scraper/collector.rb
+ - lib/socializer/scraper/detector.rb
+ - lib/socializer/scraper/extensions.rb
+ - lib/socializer/scraper/extractor.rb
+ - lib/socializer/scraper/validator.rb
+ - lib/socializer/scraper/version.rb
+ - socializer-scraper.gemspec
+ - spec/socializer/scraper_spec.rb
+ - spec/spec_helper.rb
+ homepage: http://nikhgupta.com
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Various scrapers for the Socializer application.
+ test_files:
+ - spec/socializer/scraper_spec.rb
+ - spec/spec_helper.rb
+ has_rdoc: