flyerhzm-regexp_crawler 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/lib/regexp_crawler/crawler.rb +15 -8
- data/regexp_crawler.gemspec +3 -4
- data/spec/regexp_crawler_spec.rb +34 -0
- metadata +3 -5
- data/.gitignore +0 -1
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.4.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module RegexpCrawler
|
2
2
|
class Crawler
|
3
|
-
attr_accessor :start_page, :continue_regexp, :named_captures, :model
|
3
|
+
attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method
|
4
4
|
|
5
5
|
def initialize(options = {})
|
6
6
|
@start_page = options[:start_page]
|
@@ -8,6 +8,7 @@ module RegexpCrawler
|
|
8
8
|
@capture_regexp = options[:capture_regexp]
|
9
9
|
@named_captures = options[:named_captures]
|
10
10
|
@model = options[:model]
|
11
|
+
@save_method = options[:save_method]
|
11
12
|
end
|
12
13
|
|
13
14
|
def capture_regexp=(regexp)
|
@@ -15,16 +16,15 @@ module RegexpCrawler
|
|
15
16
|
end
|
16
17
|
|
17
18
|
def start
|
18
|
-
results = []
|
19
|
+
@results = []
|
19
20
|
@captured_pages = []
|
20
21
|
@pages = [URI.parse(@start_page)]
|
21
|
-
while !@pages.empty?
|
22
|
+
while !@pages.empty? and !@stop
|
22
23
|
uri = @pages.shift
|
23
24
|
@captured_pages << uri
|
24
|
-
|
25
|
-
results << result if result
|
25
|
+
parse_page(uri)
|
26
26
|
end
|
27
|
-
results
|
27
|
+
@results
|
28
28
|
end
|
29
29
|
|
30
30
|
private
|
@@ -49,12 +49,19 @@ module RegexpCrawler
|
|
49
49
|
captures.each_index do |i|
|
50
50
|
result[named_captures[i].to_sym] = captures[i]
|
51
51
|
end
|
52
|
-
|
52
|
+
url = "#{uri.scheme}://#{uri.host}#{uri.path}"
|
53
|
+
if @save_method
|
54
|
+
ret = @save_method.call(result, url)
|
55
|
+
@stop = true if ret == false
|
56
|
+
else
|
57
|
+
@results << {@model.downcase.to_sym => result, :page => url}
|
58
|
+
end
|
53
59
|
end
|
54
60
|
elsif response.is_a? Net::HTTPRedirection
|
55
61
|
parse_page(URI.parse(response['location']))
|
56
62
|
else
|
63
|
+
# do nothing
|
57
64
|
end
|
58
65
|
end
|
59
|
-
|
66
|
+
end
|
60
67
|
end
|
data/regexp_crawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{regexp_crawler}
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Richard Huang"]
|
12
|
-
s.date = %q{2009-08-
|
12
|
+
s.date = %q{2009-08-29}
|
13
13
|
s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
|
14
14
|
s.email = %q{flyerhzm@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
"README.textile"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
|
-
".gitignore",
|
21
|
-
"LICENSE",
|
20
|
+
"LICENSE",
|
22
21
|
"README.textile",
|
23
22
|
"Rakefile",
|
24
23
|
"TODO",
|
data/spec/regexp_crawler_spec.rb
CHANGED
@@ -36,6 +36,40 @@ describe RegexpCrawler::Crawler do
|
|
36
36
|
results.first[:post][:title].should == 'nested1'
|
37
37
|
results.last[:post][:title].should == 'nested2'
|
38
38
|
end
|
39
|
+
|
40
|
+
it "should save by myself" do
|
41
|
+
crawl = RegexpCrawler::Crawler.new
|
42
|
+
crawl.start_page = 'http://complex.com/'
|
43
|
+
crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
|
44
|
+
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
45
|
+
crawl.named_captures = ['title', 'date', 'body']
|
46
|
+
crawl.model = 'post'
|
47
|
+
my_results = []
|
48
|
+
crawl.save_method = Proc.new {|result, page| my_results << result}
|
49
|
+
results = crawl.start
|
50
|
+
results.size.should == 0
|
51
|
+
my_results.size.should == 2
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should stop parse" do
|
55
|
+
crawl = RegexpCrawler::Crawler.new
|
56
|
+
crawl.start_page = 'http://complex.com/'
|
57
|
+
crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
|
58
|
+
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
59
|
+
crawl.named_captures = ['title', 'date', 'body']
|
60
|
+
crawl.model = 'post'
|
61
|
+
stop_page = "http://complex.com/nested1.html"
|
62
|
+
parse_pages = []
|
63
|
+
crawl.save_method = Proc.new do |result, page|
|
64
|
+
if page == stop_page
|
65
|
+
false
|
66
|
+
else
|
67
|
+
parse_pages << page
|
68
|
+
end
|
69
|
+
end
|
70
|
+
results = crawl.start
|
71
|
+
parse_pages.size.should == 0
|
72
|
+
end
|
39
73
|
end
|
40
74
|
|
41
75
|
def success_page(local_path, remote_path)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flyerhzm-regexp_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Huang
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-29 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -23,7 +23,6 @@ extra_rdoc_files:
|
|
23
23
|
- LICENSE
|
24
24
|
- README.textile
|
25
25
|
files:
|
26
|
-
- .gitignore
|
27
26
|
- LICENSE
|
28
27
|
- README.textile
|
29
28
|
- Rakefile
|
@@ -42,7 +41,6 @@ files:
|
|
42
41
|
- spec/spec_helper.rb
|
43
42
|
has_rdoc: false
|
44
43
|
homepage: ""
|
45
|
-
licenses:
|
46
44
|
post_install_message:
|
47
45
|
rdoc_options:
|
48
46
|
- --charset=UTF-8
|
@@ -63,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
63
61
|
requirements: []
|
64
62
|
|
65
63
|
rubyforge_project:
|
66
|
-
rubygems_version: 1.
|
64
|
+
rubygems_version: 1.2.0
|
67
65
|
signing_key:
|
68
66
|
specification_version: 3
|
69
67
|
summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.
|
data/.gitignore
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
tmp/**
|