flyerhzm-regexp_crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/LICENSE +0 -0
- data/README +1 -0
- data/Rakefile +21 -0
- data/TODO +0 -0
- data/VERSION +1 -0
- data/init.rb +3 -0
- data/lib/regexp_crawler/crawler.rb +54 -0
- data/lib/regexp_crawler.rb +23 -0
- data/spec/regexp_crawler_spec.rb +56 -0
- data/spec/resources/complex.html +11 -0
- data/spec/resources/nested1.html +12 -0
- data/spec/resources/nested2.html +12 -0
- data/spec/resources/simple.html +12 -0
- data/spec/spec.opts +8 -0
- data/spec/spec_helper.rb +7 -0
- metadata +71 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
+tmp/**
data/LICENSE
ADDED
File without changes
data/README
ADDED
@@ -0,0 +1 @@
+A crawler that uses regular expressions to capture data.
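For orientation before the file-by-file diff, here is a minimal usage sketch based on the Crawler API added in lib/regexp_crawler/crawler.rb below. The Post model, the pattern, and the URL are hypothetical.

require 'regexp_crawler'

# Hypothetical model; its writers must match :named_captures below.
class Post
  attr_accessor :title, :body
end

crawler = RegexpCrawler::Crawler.new(
  :start_page     => 'http://example.com/',               # hypothetical URL
  :capture_regexp => %r{<h1>(.*?)</h1>.*<p>(.*?)</p>}m,   # one group per field
  :named_captures => ['title', 'body'],
  :model          => Post
)
crawler.start  # => [{:model => #<Post ...>, :page => 'http://example.com/'}]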
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
+require 'rake'
+require 'rake/rdoctask'
+require 'spec/rake/spectask'
+require 'jeweler'
+
+desc "Run all specs in spec directory"
+Spec::Rake::SpecTask.new(:spec) do |t|
+  t.spec_files = FileList['spec/**/*_spec.rb']
+  t.rcov = true
+  t.rcov_opts = ['--exclude', 'spec,config,Library,usr/lib/ruby']
+  t.rcov_dir = File.join(File.dirname(__FILE__), "tmp")
+end
+
+Jeweler::Tasks.new do |gemspec|
+  gemspec.name = "regexp_crawler"
+  gemspec.summary = "RegexpCrawler is a Ruby library for crawling data from websites using regular expressions."
+  gemspec.description = "RegexpCrawler is a Ruby library for crawling data from websites using regular expressions."
+  gemspec.email = "flyerhzm@gmail.com"
+  gemspec.homepage = ""
+  gemspec.authors = ["Richard Huang"]
+end
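Given this Rakefile, the suite would be run with rake spec (assuming the 1.x-era rspec, rcov, and jeweler gems are installed); Jeweler::Tasks additionally generates packaging tasks such as rake build and rake install.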
data/TODO
ADDED
File without changes
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.1.0
data/lib/regexp_crawler/crawler.rb
ADDED
@@ -0,0 +1,54 @@
+module RegexpCrawler
+  class Crawler
+    attr_accessor :start_page, :continue_regexp, :named_captures, :model
+
+    def initialize(options = {})
+      @start_page = options[:start_page]
+      @continue_regexp = options[:continue_regexp]
+      @capture_regexp = options[:capture_regexp]
+      @named_captures = options[:named_captures]
+      @model = options[:model]
+    end
+
+    def capture_regexp=(regexp)
+      @capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
+    end
+
+    def start
+      results = []
+      @pages = [@start_page]
+      while !@pages.empty?
+        uri = URI.parse(@pages.shift)
+        result = parse_page(uri)
+        results << result if result
+      end
+      results
+    end
+
+    def parse_page(uri)
+      response = Net::HTTP.get_response(uri)
+      parse_response(response, uri)
+    end
+
+    def parse_response(response, uri)
+      if response.is_a? Net::HTTPSuccess
+        response.body.scan(continue_regexp).each do |page|
+          url = page.start_with?(uri.scheme) ? page : "#{uri.scheme}://#{uri.host}/#{page}"
+          @pages << url
+        end if continue_regexp
+        md = @capture_regexp.match(response.body)
+        if md
+          model_result = model.new
+          captures = md.captures
+          captures.each_index do |i|
+            model_result.send("#{named_captures[i]}=", captures[i])
+          end
+          {:model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+        end
+      elsif response.is_a? Net::HTTPRedirection
+        parse_page(URI.parse(response['location']))
+      else
+      end
+    end
+  end
+end
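Note the capture_regexp= writer above: it rebuilds the assigned pattern with Regexp::MULTILINE ORed into its options, so '.' matches newlines even when the caller's regexp omits the /m flag. A quick plain-Ruby illustration of that behavior:

pattern   = /<div>(.*?)<\/div>/     # no /m flag
multiline = Regexp.new(pattern.source, pattern.options | Regexp::MULTILINE)

pattern.match("<div>a\nb</div>")        # => nil, '.' stops at the newline
multiline.match("<div>a\nb</div>")[1]   # => "a\nb"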
data/lib/regexp_crawler.rb
ADDED
@@ -0,0 +1,23 @@
+require 'net/http'
+require 'uri'
+
+module RegexpCrawler
+
+  def self.included(base)
+    base.extend ClassMethods
+  end
+
+  module ClassMethods
+    def regexp_crawler(options)
+      @crawlers ||= []
+      @crawlers << Crawler.new(options)
+    end
+
+    def start_crawl
+      @crawlers.each do |crawler|
+        crawler.start
+      end
+    end
+  end
+
+end
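The file above turns RegexpCrawler into a mixin: including it extends the host class with regexp_crawler, which registers one Crawler per call, and start_crawl, which runs every registered crawler. A sketch of a client class under that interface; the SiteCrawler name and all option values are hypothetical:

class SiteCrawler
  include RegexpCrawler

  # Each call registers one Crawler; options mirror Crawler#initialize.
  regexp_crawler :start_page     => 'http://example.com/',
                 :capture_regexp => %r{<title>(.*?)</title>}m,
                 :named_captures => ['title'],
                 :model          => Struct.new(:title)
end

SiteCrawler.start_crawl  # runs each registered crawler in turn

Note that start_crawl discards each crawler's results; a caller that needs the parsed models would use Crawler#start directly.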
data/spec/regexp_crawler_spec.rb
ADDED
@@ -0,0 +1,56 @@
+require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
+
+describe RegexpCrawler::Crawler do
+  class Post
+    attr_accessor :title, :date, :body
+  end
+
+  describe '#simple html' do
+    it 'should parse data according to regexp' do
+      success_page('/resources/simple.html', 'http://simple.com/')
+
+      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => Post)
+      results = crawl.start
+      results.size.should == 1
+    end
+
+    it 'should redirect' do
+      redirect_page('http://redirect.com/', 'http://simple.com/')
+      success_page('/resources/simple.html', 'http://simple.com/')
+    end
+  end
+
+  describe '#complex html' do
+    before(:each) do
+      success_page('/resources/complex.html', 'http://complex.com/')
+      success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
+      success_page('/resources/nested2.html', 'http://complex.com/nested2.html')
+    end
+
+    it 'should parse data according to regexp' do
+      crawl = RegexpCrawler::Crawler.new
+      crawl.start_page = 'http://complex.com/'
+      crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
+      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+      crawl.named_captures = ['title', 'date', 'body']
+      crawl.model = Post
+      results = crawl.start
+      results.size.should == 2
+    end
+  end
+
+  def success_page(local_path, remote_path)
+    path = File.expand_path(File.dirname(__FILE__) + local_path)
+    content = File.read(path)
+    http = mock(Net::HTTPSuccess)
+    http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
+    http.stubs(:body).returns(content)
+    Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
+  end
+
+  def redirect_page(remote_path, redirect_path)
+    http = mock(Net::HTTPRedirection)
+    http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
+    Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
+  end
+end
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,71 @@
+--- !ruby/object:Gem::Specification
+name: flyerhzm-regexp_crawler
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Richard Huang
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-08-02 00:00:00 -07:00
+default_executable:
+dependencies: []
+
+description: RegexpCrawler is a Ruby library for crawling data from websites using regular expressions.
+email: flyerhzm@gmail.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- LICENSE
+- README
+files:
+- .gitignore
+- LICENSE
+- README
+- Rakefile
+- TODO
+- VERSION
+- init.rb
+- lib/regexp_crawler.rb
+- lib/regexp_crawler/crawler.rb
+- spec/regexp_crawler_spec.rb
+- spec/resources/complex.html
+- spec/resources/nested1.html
+- spec/resources/nested2.html
+- spec/resources/simple.html
+- spec/spec.opts
+- spec/spec_helper.rb
+has_rdoc: false
+homepage: ""
+licenses:
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: RegexpCrawler is a Ruby library for crawling data from websites using regular expressions.
+test_files:
+- spec/spec_helper.rb
+- spec/regexp_crawler_spec.rb