flyerhzm-regexp_crawler 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -18,4 +18,5 @@ Jeweler::Tasks.new do |gemspec|
18
18
  gemspec.email = "flyerhzm@gmail.com"
19
19
  gemspec.homepage = ""
20
20
  gemspec.authors = ["Richard Huang"]
21
+ gemspec.files.exclude '.gitignore'
21
22
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.4.0
@@ -1,6 +1,6 @@
1
1
  module RegexpCrawler
2
2
  class Crawler
3
- attr_accessor :start_page, :continue_regexp, :named_captures, :model
3
+ attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method
4
4
 
5
5
  def initialize(options = {})
6
6
  @start_page = options[:start_page]
@@ -8,6 +8,7 @@ module RegexpCrawler
8
8
  @capture_regexp = options[:capture_regexp]
9
9
  @named_captures = options[:named_captures]
10
10
  @model = options[:model]
11
+ @save_method = options[:save_method]
11
12
  end
12
13
 
13
14
  def capture_regexp=(regexp)
@@ -15,16 +16,15 @@ module RegexpCrawler
15
16
  end
16
17
 
17
18
  def start
18
- results = []
19
+ @results = []
19
20
  @captured_pages = []
20
21
  @pages = [URI.parse(@start_page)]
21
- while !@pages.empty?
22
+ while !@pages.empty? and !@stop
22
23
  uri = @pages.shift
23
24
  @captured_pages << uri
24
- result = parse_page(uri)
25
- results << result if result
25
+ parse_page(uri)
26
26
  end
27
- results
27
+ @results
28
28
  end
29
29
 
30
30
  private
@@ -49,12 +49,19 @@ module RegexpCrawler
49
49
  captures.each_index do |i|
50
50
  result[named_captures[i].to_sym] = captures[i]
51
51
  end
52
- {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
52
+ url = "#{uri.scheme}://#{uri.host}#{uri.path}"
53
+ if @save_method
54
+ ret = @save_method.call(result, url)
55
+ @stop = true if ret == false
56
+ else
57
+ @results << {@model.downcase.to_sym => result, :page => url}
58
+ end
53
59
  end
54
60
  elsif response.is_a? Net::HTTPRedirection
55
61
  parse_page(URI.parse(response['location']))
56
62
  else
63
+ # do nothing
57
64
  end
58
65
  end
59
- end
66
+ end
60
67
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{regexp_crawler}
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Richard Huang"]
12
- s.date = %q{2009-08-22}
12
+ s.date = %q{2009-08-29}
13
13
  s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
14
14
  s.email = %q{flyerhzm@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
17
17
  "README.textile"
18
18
  ]
19
19
  s.files = [
20
- ".gitignore",
21
- "LICENSE",
20
+ "LICENSE",
22
21
  "README.textile",
23
22
  "Rakefile",
24
23
  "TODO",
@@ -36,6 +36,40 @@ describe RegexpCrawler::Crawler do
36
36
  results.first[:post][:title].should == 'nested1'
37
37
  results.last[:post][:title].should == 'nested2'
38
38
  end
39
+
40
+ it "should save by myself" do
41
+ crawl = RegexpCrawler::Crawler.new
42
+ crawl.start_page = 'http://complex.com/'
43
+ crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
44
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
45
+ crawl.named_captures = ['title', 'date', 'body']
46
+ crawl.model = 'post'
47
+ my_results = []
48
+ crawl.save_method = Proc.new {|result, page| my_results << result}
49
+ results = crawl.start
50
+ results.size.should == 0
51
+ my_results.size.should == 2
52
+ end
53
+
54
+ it "should stop parse" do
55
+ crawl = RegexpCrawler::Crawler.new
56
+ crawl.start_page = 'http://complex.com/'
57
+ crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
58
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
59
+ crawl.named_captures = ['title', 'date', 'body']
60
+ crawl.model = 'post'
61
+ stop_page = "http://complex.com/nested1.html"
62
+ parse_pages = []
63
+ crawl.save_method = Proc.new do |result, page|
64
+ if page == stop_page
65
+ false
66
+ else
67
+ parse_pages << page
68
+ end
69
+ end
70
+ results = crawl.start
71
+ parse_pages.size.should == 0
72
+ end
39
73
  end
40
74
 
41
75
  def success_page(local_path, remote_path)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flyerhzm-regexp_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-22 00:00:00 -07:00
12
+ date: 2009-08-29 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -23,7 +23,6 @@ extra_rdoc_files:
23
23
  - LICENSE
24
24
  - README.textile
25
25
  files:
26
- - .gitignore
27
26
  - LICENSE
28
27
  - README.textile
29
28
  - Rakefile
@@ -42,7 +41,6 @@ files:
42
41
  - spec/spec_helper.rb
43
42
  has_rdoc: false
44
43
  homepage: ""
45
- licenses:
46
44
  post_install_message:
47
45
  rdoc_options:
48
46
  - --charset=UTF-8
@@ -63,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
63
61
  requirements: []
64
62
 
65
63
  rubyforge_project:
66
- rubygems_version: 1.3.5
64
+ rubygems_version: 1.2.0
67
65
  signing_key:
68
66
  specification_version: 3
69
67
  summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.
data/.gitignore DELETED
@@ -1 +0,0 @@
1
- tmp/**