flyerhzm-regexp_crawler 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.0
1
+ 0.6.0
@@ -31,25 +31,33 @@ module RegexpCrawler
31
31
 
32
32
  private
33
33
  def parse_page(uri)
34
- response = Net::HTTP.start(uri.host, uri.port) do |http|
35
- http.get(uri.request_uri, headers)
36
- end
34
+ response = Net::HTTP.get_response_with_headers(uri, @headers)
37
35
  parse_response(response, uri)
38
36
  end
39
37
 
38
+ def continue_uri(uri, page)
39
+ if page.start_with?(uri.scheme)
40
+ URI.parse(page)
41
+ elsif page.start_with?('/')
42
+ URI.join(uri.scheme + '://' + uri.host, page)
43
+ else
44
+ URI.parse(uri.to_s.split('/')[0..-2].join('/') + '/' + page)
45
+ end
46
+ end
47
+
40
48
  def parse_response(response, uri)
41
- response_body = Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first if encoding
49
+ response_body = encoding.nil? ? response.body : Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first
42
50
  if response.is_a? Net::HTTPSuccess
43
51
  if continue_regexp
44
52
  response_body.scan(continue_regexp).each do |page|
45
53
  page = page.first if page.is_a? Array
46
- continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
54
+ continue_uri = continue_uri(uri, page)
47
55
  @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
48
56
  end
49
57
  end
50
58
  md = @capture_regexp.match(response_body)
51
59
  if md
52
- captures = md.captures if md
60
+ captures = md.captures
53
61
  result = {}
54
62
  captures.each_index do |i|
55
63
  result[named_captures[i].to_sym] = captures[i]
@@ -0,0 +1,9 @@
1
+ module Net
2
+ class HTTP
3
+ def HTTP.get_response_with_headers(uri, headers)
4
+ response = start(uri.host, uri.port) do |http|
5
+ http.get(uri.request_uri, headers)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{regexp_crawler}
8
- s.version = "0.5.0"
8
+ s.version = "0.6.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Richard Huang"]
12
- s.date = %q{2009-08-29}
12
+ s.date = %q{2009-08-30}
13
13
  s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
14
14
  s.email = %q{flyerhzm@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -25,11 +25,13 @@ Gem::Specification.new do |s|
25
25
  "init.rb",
26
26
  "lib/regexp_crawler.rb",
27
27
  "lib/regexp_crawler/crawler.rb",
28
+ "lib/regexp_crawler/http.rb",
28
29
  "regexp_crawler.gemspec",
29
30
  "spec/regexp_crawler_spec.rb",
30
31
  "spec/resources/complex.html",
31
32
  "spec/resources/nested1.html",
32
33
  "spec/resources/nested2.html",
34
+ "spec/resources/nested21.html",
33
35
  "spec/resources/simple.html",
34
36
  "spec/spec.opts",
35
37
  "spec/spec_helper.rb"
@@ -1,7 +1,7 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
2
2
 
3
3
  describe RegexpCrawler::Crawler do
4
- describe '#simple html' do
4
+ context '#simple html' do
5
5
  it 'should parse data according to regexp' do
6
6
  success_page('/resources/simple.html', 'http://simple.com/')
7
7
 
@@ -17,7 +17,7 @@ describe RegexpCrawler::Crawler do
17
17
  end
18
18
  end
19
19
 
20
- describe '#complex html' do
20
+ context '#complex html' do
21
21
  before(:each) do
22
22
  success_page('/resources/complex.html', 'http://complex.com/')
23
23
  success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
@@ -27,7 +27,7 @@ describe RegexpCrawler::Crawler do
27
27
  it 'should parse data according to regexp' do
28
28
  crawl = RegexpCrawler::Crawler.new
29
29
  crawl.start_page = 'http://complex.com/'
30
- crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
30
+ crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
31
31
  crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
32
32
  crawl.named_captures = ['title', 'date', 'body']
33
33
  crawl.model = 'post'
@@ -37,10 +37,24 @@ describe RegexpCrawler::Crawler do
37
37
  results.last[:post][:title].should == 'nested2'
38
38
  end
39
39
 
40
+ it 'should parse nested of nested data' do
41
+ success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
42
+ crawl = RegexpCrawler::Crawler.new
43
+ crawl.start_page = 'http://complex.com/'
44
+ crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
45
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
46
+ crawl.named_captures = ['title', 'date', 'body']
47
+ crawl.model = 'post'
48
+ results = crawl.start
49
+ results.size.should == 3
50
+ results.first[:post][:title].should == 'nested1'
51
+ results.last[:post][:title].should == 'nested21'
52
+ end
53
+
40
54
  it "should save by myself" do
41
55
  crawl = RegexpCrawler::Crawler.new
42
56
  crawl.start_page = 'http://complex.com/'
43
- crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
57
+ crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
44
58
  crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
45
59
  crawl.named_captures = ['title', 'date', 'body']
46
60
  my_results = []
@@ -53,7 +67,7 @@ describe RegexpCrawler::Crawler do
53
67
  it "should stop parse" do
54
68
  crawl = RegexpCrawler::Crawler.new
55
69
  crawl.start_page = 'http://complex.com/'
56
- crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
70
+ crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
57
71
  crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
58
72
  crawl.named_captures = ['title', 'date', 'body']
59
73
  stop_page = "http://complex.com/nested1.html"
@@ -76,12 +90,12 @@ describe RegexpCrawler::Crawler do
76
90
  http = mock(Net::HTTPSuccess)
77
91
  http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
78
92
  http.stubs(:body).returns(content)
79
- Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
93
+ Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(http)
80
94
  end
81
95
 
82
96
  def redirect_page(remote_path, redirect_path)
83
97
  http = mock(Net::HTTPRedirection)
84
98
  http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
85
- Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
99
+ Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(http)
86
100
  end
87
101
  end
@@ -4,8 +4,8 @@
4
4
  </head>
5
5
  <body>
6
6
  <div>
7
- <a link="nested1.html">nested1</a>
8
- <a link="http://complex.com/nested2.html">nested2</a>
7
+ <a href="/nested1.html">nested1</a>
8
+ <a href="http://complex.com/nested2.html">nested2</a>
9
9
  </div>
10
10
  </body>
11
11
  </html>
@@ -7,6 +7,7 @@
7
7
  <div class="title">nested2</div>
8
8
  <div class="date">2008/10/10</div>
9
9
  <div class="body"><p class="content">nested2</p></div>
10
+ <a href="nested21.html">nested21</a>
10
11
  </div>
11
12
  </body>
12
13
  </html>
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head>
3
+ <title>nested2 test html</title>
4
+ </head>
5
+ <body>
6
+ <div>
7
+ <div class="title">nested21</div>
8
+ <div class="date">2008/11/11</div>
9
+ <div class="body"><p class="content">nested21</p></div>
10
+ </div>
11
+ </body>
12
+ </html>
data/spec/spec_helper.rb CHANGED
@@ -5,3 +5,4 @@ require 'mocha'
5
5
 
6
6
  require File.join(File.dirname(__FILE__), '/../lib/regexp_crawler.rb')
7
7
  require File.join(File.dirname(__FILE__), '/../lib/regexp_crawler/crawler.rb')
8
+ require File.join(File.dirname(__FILE__), '/../lib/regexp_crawler/http.rb')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flyerhzm-regexp_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-29 00:00:00 -07:00
12
+ date: 2009-08-30 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -31,17 +31,18 @@ files:
31
31
  - init.rb
32
32
  - lib/regexp_crawler.rb
33
33
  - lib/regexp_crawler/crawler.rb
34
+ - lib/regexp_crawler/http.rb
34
35
  - regexp_crawler.gemspec
35
36
  - spec/regexp_crawler_spec.rb
36
37
  - spec/resources/complex.html
37
38
  - spec/resources/nested1.html
38
39
  - spec/resources/nested2.html
40
+ - spec/resources/nested21.html
39
41
  - spec/resources/simple.html
40
42
  - spec/spec.opts
41
43
  - spec/spec_helper.rb
42
44
  has_rdoc: false
43
45
  homepage: ""
44
- licenses:
45
46
  post_install_message:
46
47
  rdoc_options:
47
48
  - --charset=UTF-8
@@ -62,7 +63,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
62
63
  requirements: []
63
64
 
64
65
  rubyforge_project:
65
- rubygems_version: 1.3.5
66
+ rubygems_version: 1.2.0
66
67
  signing_key:
67
68
  specification_version: 3
68
69
  summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.