rextract 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color -f d
data/Gemfile ADDED
@@ -0,0 +1,20 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "mechanize", '~> 2.3'
4
+ gem 'nokogiri', '~> 1.4'
5
+ gem "thor", "~> 0.14"
6
+ gem 'activesupport', '~> 3.2'
7
+ gem 'i18n', '0.6.0'
8
+
9
+ # Add dependencies to develop your gem here.
10
+ # Include everything needed to run rake, tests, features, etc.
11
+ group :development do
12
+ gem "rspec", "~> 2.8.0"
13
+ gem "bundler", "~> 1.0.0"
14
+ gem "jeweler", "~> 1.5.2"
15
+ gem "rcov", ">= 0", :platforms => :mri_18
16
+ gem 'simplecov', :require => false, :platforms => :mri_19
17
+ gem "fakeweb", "1.3.0"
18
+ gem "ruby-debug", "0.10.4", :platforms => :mri_18
19
+ gem "ruby-debug19", "0.11.6", :platforms => :mri_19
20
+ end
@@ -0,0 +1,89 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (3.2.2)
5
+ i18n (~> 0.6)
6
+ multi_json (~> 1.0)
7
+ archive-tar-minitar (0.5.2)
8
+ columnize (0.3.6)
9
+ diff-lcs (1.1.3)
10
+ domain_name (0.5.2)
11
+ unf (~> 0.0.3)
12
+ fakeweb (1.3.0)
13
+ git (1.2.5)
14
+ i18n (0.6.0)
15
+ jeweler (1.5.2)
16
+ bundler (~> 1.0.0)
17
+ git (>= 1.2.5)
18
+ rake
19
+ linecache (0.46)
20
+ rbx-require-relative (> 0.0.4)
21
+ linecache19 (0.5.12)
22
+ ruby_core_source (>= 0.1.4)
23
+ mechanize (2.3)
24
+ domain_name (~> 0.5, >= 0.5.1)
25
+ mime-types (~> 1.17, >= 1.17.2)
26
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
27
+ net-http-persistent (~> 2.5, >= 2.5.2)
28
+ nokogiri (~> 1.4)
29
+ ntlm-http (~> 0.1, >= 0.1.1)
30
+ webrobots (~> 0.0, >= 0.0.9)
31
+ mime-types (1.17.2)
32
+ multi_json (1.1.0)
33
+ net-http-digest_auth (1.2)
34
+ net-http-persistent (2.5.2)
35
+ nokogiri (1.5.2)
36
+ ntlm-http (0.1.1)
37
+ rake (0.9.2.2)
38
+ rbx-require-relative (0.0.9)
39
+ rcov (1.0.0)
40
+ rspec (2.8.0)
41
+ rspec-core (~> 2.8.0)
42
+ rspec-expectations (~> 2.8.0)
43
+ rspec-mocks (~> 2.8.0)
44
+ rspec-core (2.8.0)
45
+ rspec-expectations (2.8.0)
46
+ diff-lcs (~> 1.1.2)
47
+ rspec-mocks (2.8.0)
48
+ ruby-debug (0.10.4)
49
+ columnize (>= 0.1)
50
+ ruby-debug-base (~> 0.10.4.0)
51
+ ruby-debug-base (0.10.4)
52
+ linecache (>= 0.3)
53
+ ruby-debug-base19 (0.11.25)
54
+ columnize (>= 0.3.1)
55
+ linecache19 (>= 0.5.11)
56
+ ruby_core_source (>= 0.1.4)
57
+ ruby-debug19 (0.11.6)
58
+ columnize (>= 0.3.1)
59
+ linecache19 (>= 0.5.11)
60
+ ruby-debug-base19 (>= 0.11.19)
61
+ ruby_core_source (0.1.5)
62
+ archive-tar-minitar (>= 0.5.2)
63
+ simplecov (0.6.1)
64
+ multi_json (~> 1.0)
65
+ simplecov-html (~> 0.5.3)
66
+ simplecov-html (0.5.3)
67
+ thor (0.14.6)
68
+ unf (0.0.5)
69
+ unf_ext
70
+ unf_ext (0.0.4)
71
+ webrobots (0.0.13)
72
+
73
+ PLATFORMS
74
+ ruby
75
+
76
+ DEPENDENCIES
77
+ activesupport (~> 3.2)
78
+ bundler (~> 1.0.0)
79
+ fakeweb (= 1.3.0)
80
+ i18n (= 0.6.0)
81
+ jeweler (~> 1.5.2)
82
+ mechanize (~> 2.3)
83
+ nokogiri (~> 1.4)
84
+ rcov
85
+ rspec (~> 2.8.0)
86
+ ruby-debug (= 0.10.4)
87
+ ruby-debug19 (= 0.11.6)
88
+ simplecov
89
+ thor (~> 0.14)
@@ -0,0 +1,56 @@
1
+ # rextract
2
+
3
+ A web scraping framework using Matt Thorley's [Dirt](https://github.com/mthorley/dirt) pattern.
4
+
5
+ The idea is that you
6
+
7
+ gem install rextract
8
+ rextract project new-scraping-project -j job1,job2
9
+
10
+ and it'll create a new scraping project for you and create a template of scripts, spiders and parsers so you can quickly create scraping projects. The -j option creates a job folder with a spider and a parser for each named, comma-separated job.
11
+
12
+ ## Concepts
13
+
14
+ ### Spiders
15
+
16
+ Spiders inherit from Rextract::Browser which is essentially Mechanize with some helpers. You use Mechanize's methods to get, post and do all the fancy browsing and create some methods to return the body content you want for given pages. Anything more complicated than a couple of xpaths or css selectors you should stick in a Parser.
17
+
18
+ By default the spider saves all scraped content to disk, in a timestamped directory with one file per URL. This lets you alter parsers without having to re-download the content.
19
+
20
+ ### Parsers
21
+
22
+ Parsers inherit from Rextract::Parser. Define methods with the prefix 'parse_'; they are all called automatically and their return values are collected into a hash of results, keyed by the method name without the prefix. When you create a new Parser object, pass the body content to .new(), then call #parse on the object.
23
+
24
+ ## Contributing to rextract
25
+
26
+ * Contributions are very much appreciated!
27
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
28
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
29
+ * Fork the project
30
+ * Start a feature/bugfix branch
31
+ * Commit and push until you are happy with your contribution
32
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
33
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
34
+
35
+ ## Copyright and License
36
+
37
+ Copyright (c) 2011 JT Zemp and contributors. Licensed under MIT.
38
+
39
+ Permission is hereby granted, free of charge, to any person obtaining
40
+ a copy of this software and associated documentation files (the
41
+ "Software"), to deal in the Software without restriction, including
42
+ without limitation the rights to use, copy, modify, merge, publish,
43
+ distribute, sublicense, and/or sell copies of the Software, and to
44
+ permit persons to whom the Software is furnished to do so, subject to
45
+ the following conditions:
46
+
47
+ The above copyright notice and this permission notice shall be
48
+ included in all copies or substantial portions of the Software.
49
+
50
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
51
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
52
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
53
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
54
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
55
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
56
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,47 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'rake'
11
+
12
+ require 'jeweler'
13
+ Jeweler::Tasks.new do |gem|
14
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
15
+ gem.name = "rextract"
16
+ gem.homepage = "http://github.com/jtzemp/rextract"
17
+ gem.license = "MIT"
18
+ summary = %q{A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract for more details.}
19
+ gem.summary = summary
20
+ gem.description = summary
21
+ gem.email = "jtzemp@gmail.com"
22
+ gem.authors = ["JT Zemp"]
23
+ end
24
+ Jeweler::RubygemsDotOrgTasks.new
25
+
26
+ require 'rspec/core'
27
+ require 'rspec/core/rake_task'
28
+ RSpec::Core::RakeTask.new(:spec) do |spec|
29
+ spec.pattern = FileList['spec/**/*_spec.rb']
30
+ end
31
+
32
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
33
+ spec.pattern = 'spec/**/*_spec.rb'
34
+ spec.rcov = true
35
+ end
36
+
37
+ task :default => :spec
38
+
39
+ require 'rake/rdoctask'
40
+ Rake::RDocTask.new do |rdoc|
41
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
42
+
43
+ rdoc.rdoc_dir = 'rdoc'
44
+ rdoc.title = "rextract #{version}"
45
+ rdoc.rdoc_files.include('README*')
46
+ rdoc.rdoc_files.include('lib/**/*.rb')
47
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.5
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ require 'rubygems'
4
+ require 'thor'
5
+ require 'thor/group'
6
+ require 'active_support/all'
7
+
8
# Command-line generator for rextract projects and jobs, built on Thor.
#
#   rextract project NAME [-j job1,job2]   # new project, optionally with jobs
#   rextract job NAME                      # new job in the current directory
class RextractApp < Thor
  include Thor::Actions

  # Helpers declared inside no_tasks are not treated as CLI tasks, so Thor
  # does not warn about public methods that lack a description.
  no_tasks do
    # Name of the job currently being generated. It is read while the "job"
    # template directory is copied (file names and template contents).
    attr_accessor :job_name

    # Copies the "job" template directory to "<project_name>/<job_name>".
    def create_job(project_name, job_name)
      @job_name = job_name # hack for template parsing and interpolation
      puts "creating #{job_name} job"
      directory("job", "#{project_name}/#{job_name}")
    end
  end

  # Directory that holds the templates used by the generator actions.
  def self.source_root
    File.dirname(__FILE__) + "/../templates"
  end

  desc "project", "creates a new rextract project and optionally a number of jobs by passing -j job1,job2,job3"
  method_option :jobs, :aliases => '-j', :desc => "create named jobs as well", :default => ""
  def project(name)
    # Drop empty entries so "-j a,,b" or a trailing comma does not create a
    # job with an empty name.
    jobs = options[:jobs].to_s.split(",").map(&:strip).reject(&:empty?)

    puts "create project #{name}"
    empty_directory(name)
    template('app/run', "#{name}/run")
    chmod("#{name}/run", 0744)

    jobs.each do |job_name|
      create_job(name, job_name)
    end
  end

  desc "job", "creates a new job"
  def job(job_name)
    # create_job needs the containing directory as well as the job name; a
    # standalone job is generated relative to the current directory.
    create_job(".", job_name)
  end
end
44
+
45
+ RextractApp.start
@@ -0,0 +1,6 @@
1
# Top-level namespace of the rextract gem. Loading this file pulls in the
# browser, spider and parser components.
module Rextract
  require 'rextract/browser'
  require 'rextract/spider'
  require 'rextract/parser'

  # Gem version, read from the VERSION file one directory above this one.
  # File.read closes the file handle; strip removes any trailing newline.
  VERSION = File.read(File.expand_path('../VERSION', File.dirname(__FILE__))).strip
end
@@ -0,0 +1,72 @@
1
+ require 'rextract'
2
+ require 'mechanize'
3
+ require 'time'
4
+
5
+ module Rextract
6
# Mixin that archives every page fetched through +get+ to disk.
#
# It must be included into a class whose ancestors implement +get+ and
# return an object responding to +body+ and +header+. For each request the
# body is written to "<archive_dir>/<sanitized url>.html" and a
# pretty-printed dump of the headers to "<archive_dir>/<sanitized url>.headers".
module ArchiveResponse
  def initialize(*args)
    @archive_dir = nil
    super(*args)
  end

  # Returns an absolute, timestamped directory path below +base+.
  # +base+ is expected to end with a path separator.
  def default_archive_dir(base = "~/tmp/")
    File.expand_path(base + Time.now.strftime("%Y-%m-%d_%H-%M-%S/"))
  end

  # Directory that responses are archived into. When none has been set the
  # default is computed once and remembered, so every file written by this
  # object lands in the same directory.
  def archive_dir
    @archive_dir ||= default_archive_dir
  end

  # Creates +dir_path+ (and any missing parents). mkdir_p is a no-op when
  # the directory already exists.
  def ensure_dir(dir_path)
    FileUtils.mkdir_p(dir_path)
  end

  # Sets the archive directory, creating it if necessary. The path is
  # expanded first so the directory created is the one that is stored.
  def archive_dir=(dir_path)
    dir = File.expand_path(dir_path)
    ensure_dir(dir)
    @archive_dir = dir
  end

  # Delegates to the parent +get+, then stores the response body and
  # headers under archive_dir. Returns the page unchanged.
  #
  # The URL is taken from the first argument, or from its :url key when the
  # first argument is a Hash.
  def get(*args)
    target = args.first
    url = target.is_a?(Hash) ? target[:url] : target
    base_path = "#{archive_dir}/#{sanitize_url(url)}"
    ensure_dir(archive_dir)

    page = super(*args)

    write_to_file("#{base_path}.html", page.body.to_s)

    header_output = ''.dup
    PP.pp(page.header, header_output)
    write_to_file("#{base_path}.headers", header_output)

    page
  end

  # Writes +data+ to +path+, replacing any existing file.
  def write_to_file(path, data)
    File.open(path, "w") do |f|
      f.write(data)
    end
  end

  # Turns a URL (String or anything with +to_s+) into a file-name-safe
  # string: every character other than ASCII letters, digits, "_", "-" and
  # "." becomes "_".
  def sanitize_url(url)
    url.to_s.gsub(/[^A-Za-z0-9_\-.]/, "_")
  end
end
59
+
60
# Mixin that logs the URL of every +get+ request to $stderr, with a
# millisecond timestamp, before delegating to the parent +get+.
module LogRequests
  # The URL is taken from the first argument, or from its :url key when the
  # first argument is a Hash. Returns whatever the parent +get+ returns.
  def get(*args)
    target = args.first
    url = target.is_a?(Hash) ? target[:url] : target
    # TODO Switch the $stderr.puts call with something that uses a _real_ logger
    $stderr.puts "[#{Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")}] Requesting URL #{url}"
    super(*args)
  end
end
67
+
68
# Mechanize agent with request logging and response archiving mixed in.
class Browser < Mechanize
  # A multi-argument include appends the modules in reverse order, so
  # LogRequests sits ahead of ArchiveResponse in the ancestor chain: a call
  # to get is logged first, then archived, then handled by Mechanize.
  include LogRequests, ArchiveResponse
end
72
+ end
@@ -0,0 +1,38 @@
1
+ require 'rextract'
2
+ require 'nokogiri'
3
+
4
module Rextract
  # Base class for parsers. Subclasses define methods named parse_<key>;
  # #parse calls each of them and collects the results in a Hash.
  class Parser
    attr_reader :content, :doc, :opts

    # content - one of:
    #   * an object responding to both #parser and #html_body: its parser
    #     becomes #doc and its html_body becomes #content;
    #   * an object responding to #body: the object itself becomes #doc and
    #     its body becomes #content;
    #   * anything else: converted with #to_s and parsed with Nokogiri::HTML.
    # opts    - extra arguments, kept as-is in #opts.
    def initialize(content, *opts)
      @opts = opts
      # An if/elsif chain is required here: "case content when <boolean>"
      # compares the boolean with content via === and would never match.
      if content.respond_to?(:parser) && content.respond_to?(:html_body)
        @doc = content.parser
        @content = content.html_body
      elsif content.respond_to?(:body)
        @doc = content
        @content = content.body
      else
        @content = content.to_s
        @doc = Nokogiri::HTML(@content)
      end
    end

    # Returns the first capture (or the first match when the regex has no
    # groups) of +regex+ in the raw content, or nil when nothing matches.
    def extract(regex)
      @content.scan(regex).flatten.first
    end

    # Names of all public methods on this parser that start with "parse_".
    def parsing_methods
      methods.select { |name| name.to_s =~ /\Aparse_/ }
    end

    # Calls every parse_* method and returns { :<key> => result }, where
    # <key> is the method name without the "parse_" prefix.
    def parse
      result = {}
      parsing_methods.each do |method|
        key = method.to_s[/\Aparse_(\w*)/, 1].to_sym
        result[key] = send(method)
      end
      result
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'rextract'
2
+
3
# Wraps a configured Rextract::Browser, exposed through #agent.
class Rextract::Spider
  attr_reader :agent

  # opts - Hash of settings for the browser:
  #   :user_agent_alias  - user agent alias (defaults to 'Mac Safari')
  #   :user / :password  - when both keys are present, passed to #auth
  #   any other key      - assigned through "<key>=" if the browser
  #                        responds to that setter, otherwise ignored
  def initialize(opts = {})
    @agent = Rextract::Browser.new do |browser|
      browser.user_agent_alias = opts[:user_agent_alias] || 'Mac Safari'
      browser.follow_meta_refresh = true

      if opts.has_key?(:user) && opts.has_key?(:password)
        browser.auth(opts[:user], opts[:password])
      end

      opts.each_pair do |name, value|
        setter = "#{name}="
        next unless browser.respond_to?(setter)
        browser.send(setter, value)
      end
    end
  end

end
@@ -0,0 +1,105 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "rextract"
8
+ s.version = "0.1.5"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["JT Zemp"]
12
+ s.date = "2012-03-12"
13
+ s.description = "A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract for more details."
14
+ s.email = "jtzemp@gmail.com"
15
+ s.executables = ["rextract"]
16
+ s.extra_rdoc_files = [
17
+ "README.markdown"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "README.markdown",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "bin/rextract",
28
+ "lib/rextract.rb",
29
+ "lib/rextract/browser.rb",
30
+ "lib/rextract/parser.rb",
31
+ "lib/rextract/spider.rb",
32
+ "rextract.gemspec",
33
+ "spec/rextract/browser_spec.rb",
34
+ "spec/rextract/parser_spec.rb",
35
+ "spec/rextract/spider_spec.rb",
36
+ "spec/rextract_spec.rb",
37
+ "spec/spec_helper.rb",
38
+ "templates/app/run",
39
+ "templates/job/%job_name%.rb.tt",
40
+ "templates/job/parser.rb.tt",
41
+ "templates/job/spider.rb.tt",
42
+ "templates/test.thor"
43
+ ]
44
+ s.homepage = "http://github.com/jtzemp/rextract"
45
+ s.licenses = ["MIT"]
46
+ s.require_paths = ["lib"]
47
+ s.rubygems_version = "1.8.11"
48
+ s.summary = "A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract for more details."
49
+ s.test_files = [
50
+ "spec/rextract/browser_spec.rb",
51
+ "spec/rextract/parser_spec.rb",
52
+ "spec/rextract/spider_spec.rb",
53
+ "spec/rextract_spec.rb",
54
+ "spec/spec_helper.rb"
55
+ ]
56
+
57
+ if s.respond_to? :specification_version then
58
+ s.specification_version = 3
59
+
60
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
61
+ s.add_runtime_dependency(%q<mechanize>, ["~> 2.3"])
62
+ s.add_runtime_dependency(%q<nokogiri>, ["~> 1.4"])
63
+ s.add_runtime_dependency(%q<thor>, ["~> 0.14"])
64
+ s.add_runtime_dependency(%q<activesupport>, ["~> 3.2"])
65
+ s.add_runtime_dependency(%q<i18n>, ["= 0.6.0"])
66
+ s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
67
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
68
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
69
+ s.add_development_dependency(%q<rcov>, [">= 0"])
70
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
71
+ s.add_development_dependency(%q<fakeweb>, ["= 1.3.0"])
72
+ s.add_development_dependency(%q<ruby-debug>, ["= 0.10.4"])
73
+ s.add_development_dependency(%q<ruby-debug19>, ["= 0.11.6"])
74
+ else
75
+ s.add_dependency(%q<mechanize>, ["~> 2.3"])
76
+ s.add_dependency(%q<nokogiri>, ["~> 1.4"])
77
+ s.add_dependency(%q<thor>, ["~> 0.14"])
78
+ s.add_dependency(%q<activesupport>, ["~> 3.2"])
79
+ s.add_dependency(%q<i18n>, ["= 0.6.0"])
80
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
81
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
82
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
83
+ s.add_dependency(%q<rcov>, [">= 0"])
84
+ s.add_dependency(%q<simplecov>, [">= 0"])
85
+ s.add_dependency(%q<fakeweb>, ["= 1.3.0"])
86
+ s.add_dependency(%q<ruby-debug>, ["= 0.10.4"])
87
+ s.add_dependency(%q<ruby-debug19>, ["= 0.11.6"])
88
+ end
89
+ else
90
+ s.add_dependency(%q<mechanize>, ["~> 2.3"])
91
+ s.add_dependency(%q<nokogiri>, ["~> 1.4"])
92
+ s.add_dependency(%q<thor>, ["~> 0.14"])
93
+ s.add_dependency(%q<activesupport>, ["~> 3.2"])
94
+ s.add_dependency(%q<i18n>, ["= 0.6.0"])
95
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
96
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
97
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
98
+ s.add_dependency(%q<rcov>, [">= 0"])
99
+ s.add_dependency(%q<simplecov>, [">= 0"])
100
+ s.add_dependency(%q<fakeweb>, ["= 1.3.0"])
101
+ s.add_dependency(%q<ruby-debug>, ["= 0.10.4"])
102
+ s.add_dependency(%q<ruby-debug19>, ["= 0.11.6"])
103
+ end
104
+ end
105
+
@@ -0,0 +1,94 @@
1
+ require 'spec_helper'
2
+
3
+ class SpecArchiveWrapperParent
4
+ def get(*args)
5
+ OpenStruct.new(body: "body", header: [{ "header key" => "header value"}])
6
+ end
7
+ end
8
+
9
+ class SpecArchiveWrapper < SpecArchiveWrapperParent
10
+ include Rextract::ArchiveResponse
11
+ end
12
+
13
+
14
+ describe Rextract::ArchiveResponse do
15
+ before(:each) do
16
+ @alphabet = ("a".."z").to_a + ("A".."Z").to_a
17
+ @random_dir = Dir.tmpdir + "/" + @alphabet.shuffle[rand(@archive_dir),9].shuffle.join
18
+ end
19
+
20
+ after(:each) do
21
+ FileUtils.rm_rf(@random_dir)
22
+ end
23
+
24
+ describe "#archive_dir=" do
25
+ it "creates the directory (and it's parent directories) if it doesn't exist" do
26
+ rar = SpecArchiveWrapper.new
27
+ File.should_not be_exists(@random_dir)
28
+ rar.archive_dir = @random_dir
29
+ rar.archive_dir.should == @random_dir
30
+ File.should be_exists(@random_dir)
31
+ end
32
+ end
33
+
34
+ describe "#get" do
35
+ it "wraps super(*args) and saves the output to $archive_dir/$something" do
36
+ rar = SpecArchiveWrapper.new
37
+ rar.archive_dir = @random_dir
38
+
39
+ File.should_not be_exists("#{@random_dir}/a.html")
40
+ page = rar.get('a')
41
+ page.body.should == "body"
42
+ page.header.should == [{"header key"=>"header value"}]
43
+ File.should be_exists("#{@random_dir}/a.html")
44
+ end
45
+ end
46
+
47
+ describe "#default_archive_dir" do
48
+ it "takes an optional base param and returns a string representing the base path and the current timestamp" do
49
+ SpecArchiveWrapper.new.default_archive_dir.should match /^\/.*\/tmp\/\d{4}+-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}/
50
+ end
51
+ end
52
+
53
+ describe "#write_to_file" do
54
+ it "takes a path, and a string for data and writes the data to the path" do
55
+ path = "#{@random_dir}/test.txt"
56
+ File.should_not be_exists(path)
57
+ FileUtils.mkdir_p(@random_dir)
58
+ SpecArchiveWrapper.new.write_to_file(path, "test")
59
+ File.should be_exists(path)
60
+ end
61
+ end
62
+
63
+ end
64
+
65
+ describe Rextract::LogRequests do
66
+ before(:all) do
67
+ class SpecLogRequestsParent
68
+ def get(*args)
69
+ args
70
+ end
71
+ end
72
+
73
+ class SpecLogRequests < SpecLogRequestsParent
74
+ include Rextract::LogRequests
75
+ end
76
+ end
77
+
78
+ before(:each) do
79
+ $stderr = StringIO.new
80
+ end
81
+
82
+ after(:each) do
83
+ $stderr = STDERR
84
+ end
85
+
86
+ describe "#get" do
87
+ it "should log the get request URL to STDERR" do
88
+ slp = SpecLogRequests.new
89
+ slp.get("a", "b", "c").should == ["a", "b", "c"]
90
+ $stderr.rewind
91
+ $stderr.read.should match(/\[\d+-\d+-\d+\ \d+:\d+:\d+\.\d{3}] Requesting URL a/)
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,58 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rextract::Parser do
4
+ before(:each) do
5
+ @html =<<EOHTML
6
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
7
+ "http://www.w3.org/TR/html4/loose.dtd">
8
+
9
+ <html lang="en">
10
+ <head>
11
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
12
+ <title>test html</title>
13
+ <meta name="generator" content="TextMate http://macromates.com/">
14
+ <meta name="author" content="Author Name">
15
+ <!-- Date: 2011-01-19 -->
16
+ </head>
17
+ <body>
18
+ <div id="content"><div id="left_nav">navbar</div></div>
19
+ </body>
20
+ </html>
21
+ EOHTML
22
+ end
23
+ describe ".new" do
24
+ it "takes a string of HTML and loads it into Nokogiri" do
25
+ p = Rextract::Parser.new(@html)
26
+ p.doc.should be_a(Nokogiri::HTML::Document)
27
+ p.doc.css("#left_nav").inner_text.should == "navbar"
28
+ end
29
+ end
30
+
31
+ describe "#extract" do
32
+ it "takes a regular expression and returns the first match" do
33
+ p = Rextract::Parser.new(@html)
34
+ p.extract(%r{<meta name="author" content="(.+)"}).should == "Author Name"
35
+ end
36
+ end
37
+
38
+ describe "#parsing_methods" do
39
+ before(:all) do
40
+ class SampleParser < Rextract::Parser
41
+ def parse_generator ; return 'parse_generator'; end
42
+ def parse_nothing ; return 'parse_nothing'; end
43
+ def parse_lang ; return 'parse_lang'; end
44
+ end
45
+ end
46
+ it "returns an array of methods that start with 'parse_' " do
47
+ p = SampleParser.new(@html)
48
+ p.parsing_methods.sort.should == [:parse_lang, :parse_generator, :parse_nothing].sort
49
+ end
50
+ end
51
+
52
+ describe "#parse" do
53
+ it "returns a hash of all the output from the parse_* methods" do
54
+ p = SampleParser.new(@html)
55
+ p.parse.should == {:generator=>"parse_generator", :nothing=>"parse_nothing", :lang=>"parse_lang"}
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rextract::Spider do
4
+ describe "#new" do
5
+ describe "options hash tries to set all options on the Mechanize agent" do
6
+ describe ":user_agent_alias" do
7
+ it "accepts :ua_alias and passes it to the Mechanize agent" do
8
+ spider = Rextract::Spider.new(:user_agent_alias => "Mac FireFox")
9
+ spider.agent.user_agent.should == "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6"
10
+ end
11
+
12
+ it ":ua_alias defaults to 'Mac Safari'" do
13
+ spider = Rextract::Spider.new
14
+ spider.agent.user_agent.should == "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
15
+ end
16
+ end
17
+
18
+ describe ":request_headers" do
19
+ it "accepts :request_headers as a Hash" do
20
+ spider = Rextract::Spider.new(:request_headers => {"Funky" => "Chicken"})
21
+ spider.agent.request_headers.should == {"Funky" => "Chicken"}
22
+ end
23
+
24
+ it "defaults to none" do
25
+ spider = Rextract::Spider.new
26
+ spider.agent.request_headers.should == {}
27
+ end
28
+ end
29
+
30
+ describe "simple authentication" do
31
+ it "attempts simple authentication if opts[:user] and opts[:password] are set" do
32
+ spider = Rextract::Spider.new(:user => 'goofball-user', :password => 'goofball-password')
33
+ spider.agent.inspect.should match(/@user=\"goofball-user\"/)
34
+ spider.agent.inspect.should match(/@password=\"goofball-password\"/)
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ describe "#agent" do
41
+ it "returns a Mechanize Agent" do
42
+ spider = Rextract::Spider.new
43
+ spider.agent.should be_a(Mechanize)
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,7 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "Rextract" do
4
+ it "doesn't fail" do
5
+ true.should == true
6
+ end
7
+ end
@@ -0,0 +1,20 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'rspec'
4
+ require 'rextract'
5
+
6
+ require 'rextract/browser'
7
+ require 'ruby-debug'
8
+
9
+ require 'tempfile'
10
+ require 'fileutils'
11
+ require 'stringio'
12
+ require 'ostruct'
13
+
14
+ # Requires supporting files with custom matchers and macros, etc,
15
+ # in ./support/ and its subdirectories.
16
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
17
+
18
+ RSpec.configure do |config|
19
+
20
+ end
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # parse ARGV's
4
+
5
+ # run the scraper(s) / parser(s) to get the data you want
@@ -0,0 +1,4 @@
1
+ module <%= job_name.classify %>
2
+ require '<%= job_name.classify.underscore %>/spider'
3
+ require '<%= job_name.classify.underscore %>/parser'
4
+ end
@@ -0,0 +1,25 @@
1
+ $LOAD_PATH << File.expand_path(File.dirname(__FILE__)) + "/.."
2
+ require 'rextract'
3
+
4
+ # The Parser classname can be named related to what's being parsed
5
+ class <%= job_name.classify %>::Parser < Rextract::Parser
6
+
7
+ # create methods prepended with parse_* to have them recognized and run with #parse
8
+ # def parse_something
9
+ #
10
+ # end
11
+ #
12
+ # def parse_something_else
13
+ #
14
+ # end
15
+
16
+ end
17
+
18
+ if __FILE__ == $0
19
+ results = []
20
+ ARGV.each do |file|
21
+ results << <%= job_name.classify %>::Parser.new(File.read(file)).parse
22
+ end
23
+
24
+ STDERR.puts results.inspect
25
+ end
@@ -0,0 +1,45 @@
1
+ require '<%= @job_name %>'
2
+ require 'rextract'
3
+
4
+ class <%= @job_name.classify %>::Crawler < Rextract::Crawler
5
+
6
+ def login
7
+ signin_url = "http://www.example.com/login.php"
8
+ @agent.get(signin_url)
9
+ # set your own user and pass in private_data.rb
10
+ resp = @agent.post(signin_url, {:EmailUsername => config[:user], :Password => $pass, :SignIn => 'Log+In'})
11
+ @logged_in = true
12
+ end
13
+
14
+ def crawl(email)
15
+ login if !@logged_in
16
+ id = get_member_id(email)
17
+ return nil if !id
18
+ result = {
19
+ :id => id,
20
+ :public_profile => get_public_profile(id),
21
+ :friend_list => get_friend_list(id)
22
+ }
23
+ end
24
+
25
+ def get_member_id(email)
26
+ search_url = "http://www.bebo.com/c/search?SearchTerm=%s"
27
+ page = @agent.get search_url % email.strip
28
+ page ? page.body.scan(/AddAsFriendBox.forMemberId\((\d*)\)/).flatten.first : nil
29
+ end
30
+
31
+ def get_public_profile(id)
32
+ profile_url = "http://www.bebo.com/Profile.jsp?MemberId=%s" % id
33
+ page = @agent.get profile_url % id
34
+ raise "Login Required" if page.body =~ LOGIN_REQUIRED
35
+ return nil if page.body =~ PRIVATE_PROFILE
36
+ page.body
37
+ end
38
+
39
+ def get_friend_list(id)
40
+ friends_url = "http://bebo.com/FriendList.jsp?MemberId=%s" % id
41
+ page = @agent.get friends_url % id
42
+ return nil if page.body =~ PRIVATE_PROFILE
43
+ page.body
44
+ end
45
+ end
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'thor/group'
4
+ require 'thor/actions'
5
+ require 'ruby-debug'
6
+ require 'active_support/inflector'
7
+
8
+
9
+ class Thortest < Thor::Group
10
+ include Thor::Actions
11
+
12
+ # Define arguments and options
13
+ argument :job_name
14
+ #class_option :test_framework, :default => :test_unit
15
+
16
+ def self.source_root
17
+ r = File.dirname(__FILE__)
18
+ STDERR.puts "source_root: #{r}"
19
+ r
20
+ end
21
+
22
+ def self.dest_root
23
+ STDERR.puts "dest_root: #{Dir.pwd}"
24
+ Dir.pwd
25
+ end
26
+
27
+ def wookie
28
+ source = "job"
29
+ destination = "./"
30
+ STDERR.puts "copying #{source} to #{destination}"
31
+ directory(source, destination)
32
+ end
33
+
34
+ # def create_lib_file
35
+ # template('templates/newgem.tt', "#{name}/lib/#{name}.rb")
36
+ # end
37
+ #
38
+ # def create_test_file
39
+ # test = options[:test_framework] == "rspec" ? :spec : :test
40
+ # create_file "#{name}/#{test}/#{name}_#{test}.rb"
41
+ # end
42
+ #
43
+ # def copy_licence
44
+ # if yes?("Use MIT license?")
45
+ # # Make a copy of the MITLICENSE file at the source root
46
+ # copy_file "MITLICENSE", "#{name}/MITLICENSE"
47
+ # else
48
+ # say "Shame on you...", :red
49
+ # end
50
+ # end
51
+ end
52
+ puts "monkey at the end, but before start"
53
+
54
+ #tt = Thortest.new(:name=>"funkd")
55
+ Thortest.start
metadata ADDED
@@ -0,0 +1,223 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rextract
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - JT Zemp
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-12 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &2164397020 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.3'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2164397020
25
+ - !ruby/object:Gem::Dependency
26
+ name: nokogiri
27
+ requirement: &2164389560 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: '1.4'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *2164389560
36
+ - !ruby/object:Gem::Dependency
37
+ name: thor
38
+ requirement: &2164386120 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: '0.14'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *2164386120
47
+ - !ruby/object:Gem::Dependency
48
+ name: activesupport
49
+ requirement: &2164383920 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '3.2'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *2164383920
58
+ - !ruby/object:Gem::Dependency
59
+ name: i18n
60
+ requirement: &2164379240 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - =
64
+ - !ruby/object:Gem::Version
65
+ version: 0.6.0
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *2164379240
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: &2164377600 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 2.8.0
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *2164377600
80
+ - !ruby/object:Gem::Dependency
81
+ name: bundler
82
+ requirement: &2164371760 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ~>
86
+ - !ruby/object:Gem::Version
87
+ version: 1.0.0
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: *2164371760
91
+ - !ruby/object:Gem::Dependency
92
+ name: jeweler
93
+ requirement: &2164363740 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ~>
97
+ - !ruby/object:Gem::Version
98
+ version: 1.5.2
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *2164363740
102
+ - !ruby/object:Gem::Dependency
103
+ name: rcov
104
+ requirement: &2164353760 !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: *2164353760
113
+ - !ruby/object:Gem::Dependency
114
+ name: simplecov
115
+ requirement: &2164351900 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ type: :development
122
+ prerelease: false
123
+ version_requirements: *2164351900
124
+ - !ruby/object:Gem::Dependency
125
+ name: fakeweb
126
+ requirement: &2164350260 !ruby/object:Gem::Requirement
127
+ none: false
128
+ requirements:
129
+ - - =
130
+ - !ruby/object:Gem::Version
131
+ version: 1.3.0
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: *2164350260
135
+ - !ruby/object:Gem::Dependency
136
+ name: ruby-debug
137
+ requirement: &2164348400 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - =
141
+ - !ruby/object:Gem::Version
142
+ version: 0.10.4
143
+ type: :development
144
+ prerelease: false
145
+ version_requirements: *2164348400
146
+ - !ruby/object:Gem::Dependency
147
+ name: ruby-debug19
148
+ requirement: &2164347200 !ruby/object:Gem::Requirement
149
+ none: false
150
+ requirements:
151
+ - - =
152
+ - !ruby/object:Gem::Version
153
+ version: 0.11.6
154
+ type: :development
155
+ prerelease: false
156
+ version_requirements: *2164347200
157
+ description: A web scraping framework based on Mechanize using the Dirt pattern. See
158
+ https://github.com/jtzemp/rextract for more details.
159
+ email: jtzemp@gmail.com
160
+ executables:
161
+ - rextract
162
+ extensions: []
163
+ extra_rdoc_files:
164
+ - README.markdown
165
+ files:
166
+ - .document
167
+ - .rspec
168
+ - Gemfile
169
+ - Gemfile.lock
170
+ - README.markdown
171
+ - Rakefile
172
+ - VERSION
173
+ - bin/rextract
174
+ - lib/rextract.rb
175
+ - lib/rextract/browser.rb
176
+ - lib/rextract/parser.rb
177
+ - lib/rextract/spider.rb
178
+ - rextract.gemspec
179
+ - spec/rextract/browser_spec.rb
180
+ - spec/rextract/parser_spec.rb
181
+ - spec/rextract/spider_spec.rb
182
+ - spec/rextract_spec.rb
183
+ - spec/spec_helper.rb
184
+ - templates/app/run
185
+ - templates/job/%job_name%.rb.tt
186
+ - templates/job/parser.rb.tt
187
+ - templates/job/spider.rb.tt
188
+ - templates/test.thor
189
+ homepage: http://github.com/jtzemp/rextract
190
+ licenses:
191
+ - MIT
192
+ post_install_message:
193
+ rdoc_options: []
194
+ require_paths:
195
+ - lib
196
+ required_ruby_version: !ruby/object:Gem::Requirement
197
+ none: false
198
+ requirements:
199
+ - - ! '>='
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ segments:
203
+ - 0
204
+ hash: -3619612058136525491
205
+ required_rubygems_version: !ruby/object:Gem::Requirement
206
+ none: false
207
+ requirements:
208
+ - - ! '>='
209
+ - !ruby/object:Gem::Version
210
+ version: '0'
211
+ requirements: []
212
+ rubyforge_project:
213
+ rubygems_version: 1.8.11
214
+ signing_key:
215
+ specification_version: 3
216
+ summary: A web scraping framework based on Mechanize using the Dirt pattern. See https://github.com/jtzemp/rextract
217
+ for more details.
218
+ test_files:
219
+ - spec/rextract/browser_spec.rb
220
+ - spec/rextract/parser_spec.rb
221
+ - spec/rextract/spider_spec.rb
222
+ - spec/rextract_spec.rb
223
+ - spec/spec_helper.rb