flyerhzm-regexp_crawler 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.5.0
@@ -1,6 +1,6 @@
1
1
  module RegexpCrawler
2
2
  class Crawler
3
- attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method
3
+ attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method, :headers, :encoding
4
4
 
5
5
  def initialize(options = {})
6
6
  @start_page = options[:start_page]
@@ -9,6 +9,8 @@ module RegexpCrawler
9
9
  @named_captures = options[:named_captures]
10
10
  @model = options[:model]
11
11
  @save_method = options[:save_method]
12
+ @headers = options[:headers]
13
+ @encoding = options[:encoding]
12
14
  end
13
15
 
14
16
  def capture_regexp=(regexp)
@@ -29,32 +31,34 @@ module RegexpCrawler
29
31
 
30
32
  private
31
33
  def parse_page(uri)
32
- response = Net::HTTP.get_response(uri)
34
+ response = Net::HTTP.start(uri.host, uri.port) do |http|
35
+ http.get(uri.request_uri, headers)
36
+ end
33
37
  parse_response(response, uri)
34
38
  end
35
39
 
36
40
  def parse_response(response, uri)
41
+ response_body = Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first if encoding
37
42
  if response.is_a? Net::HTTPSuccess
38
43
  if continue_regexp
39
- response.body.scan(continue_regexp).each do |page|
44
+ response_body.scan(continue_regexp).each do |page|
40
45
  page = page.first if page.is_a? Array
41
46
  continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
42
47
  @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
43
48
  end
44
49
  end
45
- md = @capture_regexp.match(response.body)
50
+ md = @capture_regexp.match(response_body)
46
51
  if md
47
52
  captures = md.captures if md
48
53
  result = {}
49
54
  captures.each_index do |i|
50
55
  result[named_captures[i].to_sym] = captures[i]
51
56
  end
52
- url = "#{uri.scheme}://#{uri.host}#{uri.path}"
53
57
  if @save_method
54
- ret = @save_method.call(result, url)
58
+ ret = @save_method.call(result, uri.to_s)
55
59
  @stop = true if ret == false
56
60
  else
57
- @results << {@model.downcase.to_sym => result, :page => url}
61
+ @results << {@model.downcase.to_sym => result, :page => uri.to_s}
58
62
  end
59
63
  end
60
64
  elsif response.is_a? Net::HTTPRedirection
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{regexp_crawler}
8
- s.version = "0.4.0"
8
+ s.version = "0.5.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Richard Huang"]
@@ -43,7 +43,6 @@ describe RegexpCrawler::Crawler do
43
43
  crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
44
44
  crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
45
45
  crawl.named_captures = ['title', 'date', 'body']
46
- crawl.model = 'post'
47
46
  my_results = []
48
47
  crawl.save_method = Proc.new {|result, page| my_results << result}
49
48
  results = crawl.start
@@ -57,7 +56,6 @@ describe RegexpCrawler::Crawler do
57
56
  crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
58
57
  crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
59
58
  crawl.named_captures = ['title', 'date', 'body']
60
- crawl.model = 'post'
61
59
  stop_page = "http://complex.com/nested1.html"
62
60
  parse_pages = []
63
61
  crawl.save_method = Proc.new do |result, page|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flyerhzm-regexp_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Huang
@@ -41,6 +41,7 @@ files:
41
41
  - spec/spec_helper.rb
42
42
  has_rdoc: false
43
43
  homepage: ""
44
+ licenses:
44
45
  post_install_message:
45
46
  rdoc_options:
46
47
  - --charset=UTF-8
@@ -61,7 +62,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
61
62
  requirements: []
62
63
 
63
64
  rubyforge_project:
64
- rubygems_version: 1.2.0
65
+ rubygems_version: 1.3.5
65
66
  signing_key:
66
67
  specification_version: 3
67
68
  summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.