RubyGems - rextract - Versions diffs - 0.1.5 - Mend

rextract 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

data/.document +5 -0
data/.rspec +1 -0
data/Gemfile +20 -0
data/Gemfile.lock +89 -0
data/README.markdown +56 -0
data/Rakefile +47 -0
data/VERSION +1 -0
data/bin/rextract +45 -0
data/lib/rextract.rb +6 -0
data/lib/rextract/browser.rb +72 -0
data/lib/rextract/parser.rb +38 -0
data/lib/rextract/spider.rb +17 -0
data/rextract.gemspec +105 -0
data/spec/rextract/browser_spec.rb +94 -0
data/spec/rextract/parser_spec.rb +58 -0
data/spec/rextract/spider_spec.rb +46 -0
data/spec/rextract_spec.rb +7 -0
data/spec/spec_helper.rb +20 -0
data/templates/app/run +5 -0
data/templates/job/%job_name%.rb.tt +4 -0
data/templates/job/parser.rb.tt +25 -0
data/templates/job/spider.rb.tt +45 -0
data/templates/test.thor +55 -0
metadata +223 -0

data/.document ADDED

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/.rspec ADDED

	@@ -0,0 +1 @@
1	+ --color -f d

data/Gemfile ADDED

@@ -0,0 +1,20 @@
+source "http://rubygems.org"
+gem "mechanize", '~> 2.3'
+gem 'nokogiri', '~> 1.4'
+gem "thor", "~> 0.14"
+gem 'activesupport', '~> 3.2'
+gem 'i18n', '0.6.0'
+# Add dependencies to develop your gem here.
+# Include everything needed to run rake, tests, features, etc.
+group :development do
+  gem "rspec", "~> 2.8.0"
+  gem "bundler", "~> 1.0.0"
+  gem "jeweler",    "~> 1.5.2"
+  gem "rcov",       ">= 0", :platforms => :mri_18
+	gem 'simplecov', :require => false, :platforms => :mri_19
+  gem "fakeweb",    "1.3.0"
+  gem "ruby-debug", "0.10.4", :platforms => :mri_18
+  gem "ruby-debug19", "0.11.6", :platforms => :mri_19
+end

data/Gemfile.lock ADDED

@@ -0,0 +1,89 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    activesupport (3.2.2)
+      i18n (~> 0.6)
+      multi_json (~> 1.0)
+    archive-tar-minitar (0.5.2)
+    columnize (0.3.6)
+    diff-lcs (1.1.3)
+    domain_name (0.5.2)
+      unf (~> 0.0.3)
+    fakeweb (1.3.0)
+    git (1.2.5)
+    i18n (0.6.0)
+    jeweler (1.5.2)
+      bundler (~> 1.0.0)
+      git (>= 1.2.5)
+      rake
+    linecache (0.46)
+      rbx-require-relative (> 0.0.4)
+    linecache19 (0.5.12)
+      ruby_core_source (>= 0.1.4)
+    mechanize (2.3)
+      domain_name (~> 0.5, >= 0.5.1)
+      mime-types (~> 1.17, >= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (~> 2.5, >= 2.5.2)
+      nokogiri (~> 1.4)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (~> 0.0, >= 0.0.9)
+    mime-types (1.17.2)
+    multi_json (1.1.0)
+    net-http-digest_auth (1.2)
+    net-http-persistent (2.5.2)
+    nokogiri (1.5.2)
+    ntlm-http (0.1.1)
+    rake (0.9.2.2)
+    rbx-require-relative (0.0.9)
+    rcov (1.0.0)
+    rspec (2.8.0)
+      rspec-core (~> 2.8.0)
+      rspec-expectations (~> 2.8.0)
+      rspec-mocks (~> 2.8.0)
+    rspec-core (2.8.0)
+    rspec-expectations (2.8.0)
+      diff-lcs (~> 1.1.2)
+    rspec-mocks (2.8.0)
+    ruby-debug (0.10.4)
+      columnize (>= 0.1)
+      ruby-debug-base (~> 0.10.4.0)
+    ruby-debug-base (0.10.4)
+      linecache (>= 0.3)
+    ruby-debug-base19 (0.11.25)
+      columnize (>= 0.3.1)
+      linecache19 (>= 0.5.11)
+      ruby_core_source (>= 0.1.4)
+    ruby-debug19 (0.11.6)
+      columnize (>= 0.3.1)
+      linecache19 (>= 0.5.11)
+      ruby-debug-base19 (>= 0.11.19)
+    ruby_core_source (0.1.5)
+      archive-tar-minitar (>= 0.5.2)
+    simplecov (0.6.1)
+      multi_json (~> 1.0)
+      simplecov-html (~> 0.5.3)
+    simplecov-html (0.5.3)
+    thor (0.14.6)
+    unf (0.0.5)
+      unf_ext
+    unf_ext (0.0.4)
+    webrobots (0.0.13)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  activesupport (~> 3.2)
+  bundler (~> 1.0.0)
+  fakeweb (= 1.3.0)
+  i18n (= 0.6.0)
+  jeweler (~> 1.5.2)
+  mechanize (~> 2.3)
+  nokogiri (~> 1.4)
+  rcov
+  rspec (~> 2.8.0)
+  ruby-debug (= 0.10.4)
+  ruby-debug19 (= 0.11.6)
+  simplecov
+  thor (~> 0.14)

data/README.markdown ADDED

@@ -0,0 +1,56 @@
+# rextract
+A web scraping framework using Matt Thorley's [Dirt](https://github.com/mthorley/dirt) pattern.
+The idea is that you
+    gem install rextract
+    rextract project new-scraping-project -j job1,job2
+and it'll create a new scraping project for you and create a template of scripts, spiders and parsers so you can quickly create scraping projects. The -j option creates a job folder with a spider and a parser for each named, comma-separated job.
+## Concepts
+### Spiders
+Spiders inherit from Rextract::Browser which is essentially Mechanize with some helpers. You use Mechanize's methods to get, post and do all the fancy browsing and create some methods to return the body content you want for given pages. Anything more complicated than a couple of xpaths or css selectors you should stick in a Parser.
+By default the spider saves all scraped content to disk in directories named by URL and timestamp. This lets you alter parsers without having to re-download the content.
+### Parsers
+Parsers inherit from Rextract::Parser. You define methods with the prefix 'parse_'; they all get called automatically and their results are returned as a hash keyed by the part of each method name after 'parse_'. When you create a new Parser object, pass the body content to the .new() method, then call #parse on the object.
+## Contributing to rextract
+* Contributions are super much appreciated!
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
+* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
+* Fork the project
+* Start a feature/bugfix branch
+* Commit and push until you are happy with your contribution
+* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
+## Copyright and License
+Copyright (c) 2011 JT Zemp and contributors. Licensed under MIT.
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Rakefile ADDED

@@ -0,0 +1,47 @@
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "rextract"
+  gem.homepage = "http://github.com/jtzemp/rextract"
+  gem.license = "MIT"
+  summary = %q{A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract for more details.}
+  gem.summary = summary
+  gem.description = summary
+  gem.email = "jtzemp@gmail.com"
+  gem.authors = ["JT Zemp"]
+ end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rspec/core'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+task :default => :spec
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "rextract #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

	@@ -0,0 +1 @@
1	+ 0.1.5

data/bin/rextract ADDED

@@ -0,0 +1,45 @@
+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+require 'rubygems'
+require 'thor'
+require 'thor/group'
+require 'active_support/all'
+class RextractApp < Thor
+  include Thor::Actions
+  attr_accessor :job_name
+  def self.source_root
+    File.dirname(__FILE__) + "/../templates"
+  end
+  desc "project", "creates a new rextract project and optionally a number of jobs by passing -j job1,job2,job3"
+  method_option :jobs, :aliases => '-j', :desc => "create named jobs as well", :default => ""
+  def project(name)
+    jobs = options[:jobs]
+    jobs = jobs.split(",").map(&:strip)
+    puts "create project #{name}"
+    empty_directory(name)
+    template('app/run', "#{name}/run")
+    chmod("#{name}/run", 0744)
+   jobs.each do |job_name|
+      create_job(name, job_name)
+    end
+  end
+  def create_job(project_name, job_name)
+    @job_name = job_name # hack for template parsing and interpolation
+    puts "creating #{job_name} job"
+    directory("job", "#{project_name}/#{job_name}")
+  end
+ desc "job", "creates a new job"
+  def job(job_name)
+    create_job(job_name)
+  end
+end
+RextractApp.start

data/lib/rextract.rb ADDED

@@ -0,0 +1,6 @@
+module Rextract
+  require 'rextract/browser'
+  require 'rextract/spider'
+  require 'rextract/parser'
+  VERSION = File.open(File.expand_path(File.dirname(__FILE__)) + "/../VERSION").read
+end

data/lib/rextract/browser.rb ADDED

@@ -0,0 +1,72 @@
+require 'rextract'
+require 'mechanize'
+require 'time'
+require 'fileutils'
+require 'pp'
+module Rextract
+  module ArchiveResponse
+    def initialize(*args)
+      @archive_dir=nil
+      super(*args)
+    end
+    def default_archive_dir(base = "~/tmp/")
+      File.expand_path(base + Time.now.strftime("%Y-%m-%d_%H-%M-%S/"))
+    end
+    def archive_dir
+      @archive_dir || (archive_dir = default_archive_dir)
+    end
+    def ensure_dir(dir_path)
+      FileUtils.mkdir_p(dir_path) unless File.exists?(dir_path)
+    end
+    def archive_dir=(dir_path)
+      dir = File.expand_path(dir_path)
+      ensure_dir(dir_path)
+      @archive_dir = dir
+    end
+    def get(*args)
+      url = args.is_a?(Hash) ? args[:url] : args.first
+      body_path   = "#{archive_dir}/#{sanitize_url(url)}.html"
+      header_path = "#{archive_dir}/#{sanitize_url(url)}.headers"
+      ensure_dir(archive_dir)
+      page = super(*args)
+      write_to_file(body_path, page.body.to_s)
+      header_output = ''
+      PP.pp(page.header, header_output)
+      write_to_file(header_path, header_output)
+      page
+    end
+    def write_to_file(path, data)
+      File.open(path, "w+") do |f|
+        f.write(data)
+      end
+    end
+    def sanitize_url(url)
+      url.gsub(/[^A-z0-9_\-\.]/, "_")
+    end
+  end
+  module LogRequests
+    def get(*args)
+      url = args.is_a?(Hash) ? args[:url] : args.first
+      $stderr.puts "[#{Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")}] Requesting URL #{url}" # TODO Switch the STDERR.puts call with something that uses a _real_ logger
+      super(*args)
+    end
+  end
+  class Browser < Mechanize
+    include ArchiveResponse
+    include LogRequests
+  end
+end

data/lib/rextract/parser.rb ADDED

@@ -0,0 +1,38 @@
+require 'rextract'
+require 'nokogiri'
+class Rextract::Parser
+  attr_reader :content, :doc, :opts
+  def initialize(content, *opts)
+    @opts = opts
+    case content
+    when content.respond_to?(:parser) && content.respond_to?(:html_body)
+      @doc     = content.parser
+      @content = content.html_body
+    when content.respond_to?(:body)
+      @doc     = content
+      @content = content.body
+    else
+      @doc     = Nokogiri::HTML(content.to_s)
+      @content = content.to_s
+    end
+  end
+  def extract(regex)
+    @content.scan(regex).flatten.first
+  end
+  def parsing_methods
+    (self.methods - self.class.superclass.methods).collect{|m| m if m =~ /parse_/}.compact
+  end
+  def parse
+    result = {}
+    parsing_methods.each do |method|
+      key = method.to_s.scan(/parse_(\w*)/).flatten.first.to_sym
+      result[key] = self.send(method.to_sym)
+    end
+    result
+  end
+end

data/lib/rextract/spider.rb ADDED

@@ -0,0 +1,17 @@
+require 'rextract'
+class Rextract::Spider
+  attr_reader :agent
+  def initialize(opts = {})
+    @agent = Rextract::Browser.new do |a|
+      a.user_agent_alias    = opts[:user_agent_alias] || 'Mac Safari'
+      a.follow_meta_refresh = true
+      a.auth(opts[:user], opts[:password]) if opts.has_key?(:user) && opts.has_key?(:password)
+      opts.each do |key, val|
+        method = "#{key}="
+        a.send(method, val) if a.respond_to?(method)
+      end
+    end
+  end
+end

data/rextract.gemspec ADDED

@@ -0,0 +1,105 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = "rextract"
+  s.version = "0.1.5"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["JT Zemp"]
+  s.date = "2012-03-12"
+  s.description = "A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract for more details."
+  s.email = "jtzemp@gmail.com"
+  s.executables = ["rextract"]
+  s.extra_rdoc_files = [
+    "README.markdown"
+  ]
+  s.files = [
+    ".document",
+    ".rspec",
+    "Gemfile",
+    "Gemfile.lock",
+    "README.markdown",
+    "Rakefile",
+    "VERSION",
+    "bin/rextract",
+    "lib/rextract.rb",
+    "lib/rextract/browser.rb",
+    "lib/rextract/parser.rb",
+    "lib/rextract/spider.rb",
+    "rextract.gemspec",
+    "spec/rextract/browser_spec.rb",
+    "spec/rextract/parser_spec.rb",
+    "spec/rextract/spider_spec.rb",
+    "spec/rextract_spec.rb",
+    "spec/spec_helper.rb",
+    "templates/app/run",
+    "templates/job/%job_name%.rb.tt",
+    "templates/job/parser.rb.tt",
+    "templates/job/spider.rb.tt",
+    "templates/test.thor"
+  ]
+  s.homepage = "http://github.com/jtzemp/rextract"
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = "1.8.11"
+  s.summary = "A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract for more details."
+  s.test_files = [
+    "spec/rextract/browser_spec.rb",
+    "spec/rextract/parser_spec.rb",
+    "spec/rextract/spider_spec.rb",
+    "spec/rextract_spec.rb",
+    "spec/spec_helper.rb"
+  ]
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<mechanize>, ["~> 2.3"])
+      s.add_runtime_dependency(%q<nokogiri>, ["~> 1.4"])
+      s.add_runtime_dependency(%q<thor>, ["~> 0.14"])
+      s.add_runtime_dependency(%q<activesupport>, ["~> 3.2"])
+      s.add_runtime_dependency(%q<i18n>, ["= 0.6.0"])
+      s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_development_dependency(%q<rcov>, [">= 0"])
+      s.add_development_dependency(%q<simplecov>, [">= 0"])
+      s.add_development_dependency(%q<fakeweb>, ["= 1.3.0"])
+      s.add_development_dependency(%q<ruby-debug>, ["= 0.10.4"])
+      s.add_development_dependency(%q<ruby-debug19>, ["= 0.11.6"])
+    else
+      s.add_dependency(%q<mechanize>, ["~> 2.3"])
+      s.add_dependency(%q<nokogiri>, ["~> 1.4"])
+      s.add_dependency(%q<thor>, ["~> 0.14"])
+      s.add_dependency(%q<activesupport>, ["~> 3.2"])
+      s.add_dependency(%q<i18n>, ["= 0.6.0"])
+      s.add_dependency(%q<rspec>, ["~> 2.8.0"])
+      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_dependency(%q<rcov>, [">= 0"])
+      s.add_dependency(%q<simplecov>, [">= 0"])
+      s.add_dependency(%q<fakeweb>, ["= 1.3.0"])
+      s.add_dependency(%q<ruby-debug>, ["= 0.10.4"])
+      s.add_dependency(%q<ruby-debug19>, ["= 0.11.6"])
+    end
+  else
+    s.add_dependency(%q<mechanize>, ["~> 2.3"])
+    s.add_dependency(%q<nokogiri>, ["~> 1.4"])
+    s.add_dependency(%q<thor>, ["~> 0.14"])
+    s.add_dependency(%q<activesupport>, ["~> 3.2"])
+    s.add_dependency(%q<i18n>, ["= 0.6.0"])
+    s.add_dependency(%q<rspec>, ["~> 2.8.0"])
+    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+    s.add_dependency(%q<rcov>, [">= 0"])
+    s.add_dependency(%q<simplecov>, [">= 0"])
+    s.add_dependency(%q<fakeweb>, ["= 1.3.0"])
+    s.add_dependency(%q<ruby-debug>, ["= 0.10.4"])
+    s.add_dependency(%q<ruby-debug19>, ["= 0.11.6"])
+  end
+end

data/spec/rextract/browser_spec.rb ADDED

@@ -0,0 +1,94 @@
+require 'spec_helper'
+class SpecArchiveWrapperParent
+  def get(*args)
+    OpenStruct.new(body: "body", header: [{ "header key" => "header value"}])
+  end
+end
+class SpecArchiveWrapper < SpecArchiveWrapperParent
+  include Rextract::ArchiveResponse
+end
+describe Rextract::ArchiveResponse do
+  before(:each) do
+    @alphabet = ("a".."z").to_a + ("A".."Z").to_a
+    @random_dir = Dir.tmpdir + "/" + @alphabet.shuffle[rand(@archive_dir),9].shuffle.join
+  end
+  after(:each) do
+    FileUtils.rm_rf(@random_dir)
+  end
+  describe "#archive_dir=" do
+    it "creates the directory (and it's parent directories) if it doesn't exist" do
+      rar = SpecArchiveWrapper.new
+      File.should_not be_exists(@random_dir)
+      rar.archive_dir = @random_dir
+      rar.archive_dir.should == @random_dir
+      File.should be_exists(@random_dir)
+    end
+  end
+  describe "#get" do
+    it "wraps super(*args) and saves the output to $archive_dir/$something" do
+      rar = SpecArchiveWrapper.new
+      rar.archive_dir = @random_dir
+      File.should_not be_exists("#{@random_dir}/a.html")
+      page = rar.get('a')
+      page.body.should == "body"
+      page.header.should == [{"header key"=>"header value"}]
+      File.should be_exists("#{@random_dir}/a.html")
+    end
+  end
+  describe "#default_archive_dir" do
+    it "takes an optional base param and returns a string representing the base path and the current timestamp" do
+      SpecArchiveWrapper.new.default_archive_dir.should match /^\/.*\/tmp\/\d{4}+-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}/
+    end
+  end
+  describe "#write_to_file" do
+    it "takes a path, and a string for data and writes the data to the path" do
+      path = "#{@random_dir}/test.txt"
+      File.should_not be_exists(path)
+      FileUtils.mkdir_p(@random_dir)
+      SpecArchiveWrapper.new.write_to_file(path, "test")
+      File.should be_exists(path)
+    end
+  end
+end
+describe Rextract::LogRequests do
+  before(:all) do
+    class SpecLogRequestsParent
+      def get(*args)
+        args
+      end
+    end
+    class SpecLogRequests < SpecLogRequestsParent
+      include Rextract::LogRequests
+    end
+  end
+  before(:each) do
+    $stderr = StringIO.new
+  end
+  after(:each) do
+    $stderr = STDERR
+  end
+  describe "#get" do
+    it "should log the get request URL to STDERR" do
+      slp = SpecLogRequests.new
+      slp.get("a", "b", "c").should == ["a", "b", "c"]
+      $stderr.rewind
+      $stderr.read.should match(/\[\d+-\d+-\d+\ \d+:\d+:\d+\.\d{3}] Requesting URL a/)
+    end
+  end
+end

data/spec/rextract/parser_spec.rb ADDED

@@ -0,0 +1,58 @@
+require 'spec_helper'
+describe Rextract::Parser do
+  before(:each) do
+    @html =<<EOHTML
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+   "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+	<title>test html</title>
+	<meta name="generator" content="TextMate http://macromates.com/">
+	<meta name="author" content="Author Name">
+	<!-- Date: 2011-01-19 -->
+</head>
+<body>
+<div id="content"><div id="left_nav">navbar</div></div>
+</body>
+</html>
+EOHTML
+  end
+  describe ".new" do
+    it "takes a string of HTML and loads it into Nokogiri" do
+      p = Rextract::Parser.new(@html)
+      p.doc.should be_a(Nokogiri::HTML::Document)
+      p.doc.css("#left_nav").inner_text.should == "navbar"
+    end
+  end
+  describe "#extract" do
+    it "takes a regular expression and returns the first match" do
+      p = Rextract::Parser.new(@html)
+      p.extract(%r{<meta name="author" content="(.+)"}).should == "Author Name"
+    end
+  end
+  describe "#parsing_methods" do
+    before(:all) do
+      class SampleParser < Rextract::Parser
+        def parse_generator ; return 'parse_generator'; end
+        def parse_nothing ; return 'parse_nothing'; end
+        def parse_lang ; return 'parse_lang'; end
+      end
+    end
+    it "returns an array of methods that start with 'parse_' " do
+      p = SampleParser.new(@html)
+      p.parsing_methods.sort.should == [:parse_lang, :parse_generator, :parse_nothing].sort
+    end
+  end
+  describe "#parse" do
+    it "returns a hash of all the output from the parse_* methods" do
+      p = SampleParser.new(@html)
+      p.parse.should == {:generator=>"parse_generator", :nothing=>"parse_nothing", :lang=>"parse_lang"}
+    end
+  end
+end

data/spec/rextract/spider_spec.rb ADDED

@@ -0,0 +1,46 @@
+require 'spec_helper'
+describe Rextract::Spider do
+  describe "#new" do
+    describe "options hash tries to set all options on the Mechanize agent" do
+      describe ":user_agent_alias" do
+        it "accepts :ua_alias and passes it to the Mechanize agent" do
+          spider = Rextract::Spider.new(:user_agent_alias => "Mac FireFox")
+          spider.agent.user_agent.should == "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6"
+        end
+        it ":ua_alias defaults to 'Mac Safari'" do
+          spider = Rextract::Spider.new
+          spider.agent.user_agent.should == "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
+        end
+      end
+      describe ":request_headers" do
+        it "accepts :request_headers as a Hash" do
+          spider = Rextract::Spider.new(:request_headers => {"Funky" => "Chicken"})
+          spider.agent.request_headers.should == {"Funky" => "Chicken"}
+        end
+        it "defaults to none" do
+          spider = Rextract::Spider.new
+          spider.agent.request_headers.should == {}
+        end
+      end
+      describe "simple authentication" do
+        it "attempts simple authentication if opts[:user] and opts[:password] are set" do
+          spider = Rextract::Spider.new(:user => 'goofball-user', :password => 'goofball-password')
+          spider.agent.inspect.should match(/@user=\"goofball-user\"/)
+          spider.agent.inspect.should match(/@password=\"goofball-password\"/)
+        end
+      end
+    end
+  end
+  describe "#agent" do
+    it "returns a Mechanize Agent" do
+      spider = Rextract::Spider.new
+      spider.agent.should be_a(Mechanize)
+    end
+  end
+end

data/spec/rextract_spec.rb ADDED

@@ -0,0 +1,7 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+describe "Rextract" do
+  it "doesn't fail" do
+    true.should == true
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,20 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'rspec'
+require 'rextract'
+require 'rextract/browser'
+require 'ruby-debug'
+require 'tempfile'
+require 'fileutils'
+require 'stringio'
+require 'ostruct'
+# Requires supporting files with custom matchers and macros, etc,
+# in ./support/ and its subdirectories.
+Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
+RSpec.configure do |config|
+end

data/templates/app/run ADDED

@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+# parse ARGV's
+# run the scraper(s) / parser(s) to get the data you want

data/templates/job/%job_name%.rb.tt ADDED

@@ -0,0 +1,4 @@
+module <%= job_name.classify %>
+  require '<%= job_name.classify.underscore %>/spider'
+  require '<%= job_name.classify.underscore %>/parser'
+end

data/templates/job/parser.rb.tt ADDED

@@ -0,0 +1,25 @@
+$LOAD_PATH << File.expand_path(File.dirname(__FILE__)) + "/.."
+require 'rextract'
+# The Parser classname can be named related to what's being parsed
+class <%= job_name.classify %>::Parser < Rextract::Parser
+  # create methods prepended with parse_* to have them recognized and run with #parse
+  # def parse_something
+  #
+  # end
+  #
+  # def parse_something_else
+  #
+  # end
+end
+if __FILE__ == $0
+  results = []
+  ARGV.each do |file|
+    results << <%= job_name.classify %>::Parser.new(File.read(file)).parse
+  end
+  STDERR.puts results.inspect
+end

data/templates/job/spider.rb.tt ADDED

@@ -0,0 +1,45 @@
+require '<%= @job_name %>'
+require 'rextract'
+class <%= @job_name.classify %>::Crawler < Rextract::Crawler
+  def login
+    signin_url = "http://www.example.com/login.php"
+    @agent.get(signin_url)
+    # set your own user and pass in private_data.rb
+    resp = @agent.post(signin_url, {:EmailUsername => config[:user], :Password => $pass, :SignIn => 'Log+In'})
+    @logged_in = true
+  end
+  def crawl(email)
+    login if !@logged_in
+    id = get_member_id(email)
+    return nil if !id
+    result = {
+      :id => id,
+      :public_profile => get_public_profile(id),
+      :friend_list => get_friend_list(id)
+    }
+  end
+  def get_member_id(email)
+    search_url = "http://www.bebo.com/c/search?SearchTerm=%s"
+    page = @agent.get search_url % email.strip
+    page ? page.body.scan(/AddAsFriendBox.forMemberId\((\d*)\)/).flatten.first : nil
+  end
+  def get_public_profile(id)
+    profile_url = "http://www.bebo.com/Profile.jsp?MemberId=%s" % id
+    page  = @agent.get profile_url % id
+    raise "Login Required" if page.body =~ LOGIN_REQUIRED
+    return nil if page.body =~ PRIVATE_PROFILE
+    page.body
+  end
+  def get_friend_list(id)
+    friends_url = "http://bebo.com/FriendList.jsp?MemberId=%s" % id
+    page  = @agent.get friends_url % id
+    return nil if page.body =~ PRIVATE_PROFILE
+    page.body
+  end
+end

data/templates/test.thor ADDED

@@ -0,0 +1,55 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'thor/group'
+require 'thor/actions'
+require 'ruby-debug'
+require 'active_support/inflector'
+class Thortest < Thor::Group
+  include Thor::Actions
+  # Define arguments and options
+  argument :job_name
+  #class_option :test_framework, :default => :test_unit
+  def self.source_root
+    r = File.dirname(__FILE__)
+    STDERR.puts "source_root: #{r}"
+    r
+  end
+  def self.dest_root
+    STDERR.puts "dest_root: #{Dir.pwd}"
+    Dir.pwd
+  end
+  def wookie
+    source      = "job"
+    destination = "./"
+    STDERR.puts "copying #{source} to #{destination}"
+    directory(source, destination)
+  end
+  # def create_lib_file
+  #   template('templates/newgem.tt', "#{name}/lib/#{name}.rb")
+  # end
+  #
+  # def create_test_file
+  #   test = options[:test_framework] == "rspec" ? :spec : :test
+  #   create_file "#{name}/#{test}/#{name}_#{test}.rb"
+  # end
+  #
+  # def copy_licence
+  #   if yes?("Use MIT license?")
+  #     # Make a copy of the MITLICENSE file at the source root
+  #     copy_file "MITLICENSE", "#{name}/MITLICENSE"
+  #   else
+  #     say "Shame on you...", :red
+  #   end
+  # end
+end
+puts "monkey at the end, but before start"
+#tt = Thortest.new(:name=>"funkd")
+Thortest.start

metadata ADDED

@@ -0,0 +1,223 @@
+--- !ruby/object:Gem::Specification
+name: rextract
+version: !ruby/object:Gem::Version
+  version: 0.1.5
+  prerelease:
+platform: ruby
+authors:
+- JT Zemp
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-03-12 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: &2164397020 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2164397020
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: &2164389560 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2164389560
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: &2164386120 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.14'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2164386120
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  requirement: &2164383920 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '3.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2164383920
+- !ruby/object:Gem::Dependency
+  name: i18n
+  requirement: &2164379240 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - =
+      - !ruby/object:Gem::Version
+        version: 0.6.0
+  type: :runtime
+  prerelease: false
+  version_requirements: *2164379240
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &2164377600 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+  type: :development
+  prerelease: false
+  version_requirements: *2164377600
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: &2164371760 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.0
+  type: :development
+  prerelease: false
+  version_requirements: *2164371760
+- !ruby/object:Gem::Dependency
+  name: jeweler
+  requirement: &2164363740 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.2
+  type: :development
+  prerelease: false
+  version_requirements: *2164363740
+- !ruby/object:Gem::Dependency
+  name: rcov
+  requirement: &2164353760 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2164353760
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: &2164351900 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2164351900
+- !ruby/object:Gem::Dependency
+  name: fakeweb
+  requirement: &2164350260 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - =
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+  type: :development
+  prerelease: false
+  version_requirements: *2164350260
+- !ruby/object:Gem::Dependency
+  name: ruby-debug
+  requirement: &2164348400 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - =
+      - !ruby/object:Gem::Version
+        version: 0.10.4
+  type: :development
+  prerelease: false
+  version_requirements: *2164348400
+- !ruby/object:Gem::Dependency
+  name: ruby-debug19
+  requirement: &2164347200 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - =
+      - !ruby/object:Gem::Version
+        version: 0.11.6
+  type: :development
+  prerelease: false
+  version_requirements: *2164347200
+description: A web scraping framework based on Mechanize using the Dirt pattern. See
+  https://github.com/jtzemp/rextract for more details.
+email: jtzemp@gmail.com
+executables:
+- rextract
+extensions: []
+extra_rdoc_files:
+- README.markdown
+files:
+- .document
+- .rspec
+- Gemfile
+- Gemfile.lock
+- README.markdown
+- Rakefile
+- VERSION
+- bin/rextract
+- lib/rextract.rb
+- lib/rextract/browser.rb
+- lib/rextract/parser.rb
+- lib/rextract/spider.rb
+- rextract.gemspec
+- spec/rextract/browser_spec.rb
+- spec/rextract/parser_spec.rb
+- spec/rextract/spider_spec.rb
+- spec/rextract_spec.rb
+- spec/spec_helper.rb
+- templates/app/run
+- templates/job/%job_name%.rb.tt
+- templates/job/parser.rb.tt
+- templates/job/spider.rb.tt
+- templates/test.thor
+homepage: http://github.com/jtzemp/rextract
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: -3619612058136525491
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.11
+signing_key:
+specification_version: 3
+summary: A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract
+  for more details.
+test_files:
+- spec/rextract/browser_spec.rb
+- spec/rextract/parser_spec.rb
+- spec/rextract/spider_spec.rb
+- spec/rextract_spec.rb
+- spec/spec_helper.rb