flyerhzm-regexp_crawler 0.6.0 → 0.7.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.7.0
@@ -1,6 +1,6 @@
1
1
  module RegexpCrawler
2
2
  class Crawler
3
- attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method, :headers, :encoding
3
+ attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method, :headers, :encoding, :need_parse
4
4
 
5
5
  def initialize(options = {})
6
6
  @start_page = options[:start_page]
@@ -11,6 +11,7 @@ module RegexpCrawler
11
11
  @save_method = options[:save_method]
12
12
  @headers = options[:headers]
13
13
  @encoding = options[:encoding]
14
+ @need_parse = options[:need_parse]
14
15
  end
15
16
 
16
17
  def capture_regexp=(regexp)
@@ -55,18 +56,20 @@ module RegexpCrawler
55
56
  @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
56
57
  end
57
58
  end
58
- md = @capture_regexp.match(response_body)
59
- if md
60
- captures = md.captures
61
- result = {}
62
- captures.each_index do |i|
63
- result[named_captures[i].to_sym] = captures[i]
64
- end
65
- if @save_method
66
- ret = @save_method.call(result, uri.to_s)
67
- @stop = true if ret == false
68
- else
69
- @results << {@model.downcase.to_sym => result, :page => uri.to_s}
59
+ if @need_parse.nil? or @need_parse.call(uri, response_body)
60
+ md = @capture_regexp.match(response_body)
61
+ if md
62
+ captures = md.captures
63
+ result = {}
64
+ captures.each_index do |i|
65
+ result[named_captures[i].to_sym] = captures[i]
66
+ end
67
+ if @save_method
68
+ ret = @save_method.call(result, uri.to_s)
69
+ @stop = true if ret == false
70
+ else
71
+ @results << {@model.downcase.to_sym => result, :page => uri.to_s}
72
+ end
70
73
  end
71
74
  end
72
75
  elsif response.is_a? Net::HTTPRedirection
@@ -1,5 +1,6 @@
1
1
  require 'net/http'
2
2
  require 'uri'
3
+ require 'regexp_crawler/http'
3
4
 
4
5
  module RegexpCrawler
5
6
 
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{regexp_crawler}
8
- s.version = "0.6.0"
8
+ s.version = "0.7.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Richard Huang"]
@@ -82,6 +82,27 @@ describe RegexpCrawler::Crawler do
82
82
  results = crawl.start
83
83
  parse_pages.size.should == 0
84
84
  end
85
+
86
+ it 'should parse skip nested2.html' do
87
+ success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
88
+ crawl = RegexpCrawler::Crawler.new
89
+ crawl.start_page = 'http://complex.com/'
90
+ crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
91
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
92
+ crawl.named_captures = ['title', 'date', 'body']
93
+ crawl.model = 'post'
94
+ crawl.need_parse = Proc.new do |uri, response_body|
95
+ if response_body.index('nested2 test html')
96
+ false
97
+ else
98
+ true
99
+ end
100
+ end
101
+ results = crawl.start
102
+ results.size.should == 2
103
+ results.first[:post][:title].should == 'nested1'
104
+ results.last[:post][:title].should == 'nested21'
105
+ end
85
106
  end
86
107
 
87
108
  def success_page(local_path, remote_path)
@@ -1,6 +1,6 @@
1
1
  <html>
2
2
  <head>
3
- <title>nested2 test html</title>
3
+ <title>nested21 test html</title>
4
4
  </head>
5
5
  <body>
6
6
  <div>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flyerhzm-regexp_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Huang
@@ -43,6 +43,7 @@ files:
43
43
  - spec/spec_helper.rb
44
44
  has_rdoc: false
45
45
  homepage: ""
46
+ licenses:
46
47
  post_install_message:
47
48
  rdoc_options:
48
49
  - --charset=UTF-8
@@ -63,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
63
64
  requirements: []
64
65
 
65
66
  rubyforge_project:
66
- rubygems_version: 1.2.0
67
+ rubygems_version: 1.3.5
67
68
  signing_key:
68
69
  specification_version: 3
69
70
  summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.