flyerhzm-regexp_crawler 0.8.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -31,6 +31,7 @@ options is a hash
31
31
  * <code>:headers</code>, optional, a hash to define http headers
32
32
  * <code>:encoding</code>, optional, a string of the coding of crawled page, the results will be converted to utf8
33
33
  * <code>:need_parse</code>, optional, a proc if parsing the page by regexp or not, the proc accept two parameters, first is the crawled website uri, second is the response body of crawled page
34
+ * <code>:logger</code>, optional, true for logging to STDOUT, or a Logger object for logging to that logger
34
35
 
35
36
  If the crawler define :model no :save_method, the RegexpCrawler::Crawler#start will return an array of results, such as
36
37
  <pre><code>
@@ -41,7 +42,7 @@ If the crawler define :model no :save_method, the RegexpCrawler::Crawler#start w
41
42
 
42
43
  h2. Example
43
44
 
44
- a script to synchronize your github projects except fork projects
45
+ a script to synchronize your github projects except fork projects, please check <code>example/github_projects.rb</code>
45
46
 
46
47
  <pre><code>
47
48
  require 'rubygems'
@@ -49,8 +50,8 @@ require 'regexp_crawler'
49
50
 
50
51
  crawler = RegexpCrawler::Crawler.new(
51
52
  :start_page => "http://github.com/flyerhzm",
52
- :continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm/.*?/tree)">}m,
53
- :capture_regexp => %r{<a href="http://github.com/flyerhzm/.*?/tree">(.*?)</a>.*<span id="repository_description".*?>(.*?)</span>.*(<div class="(?:wikistyle|plain)">.*?</div>)</div>}m,
53
+ :continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm/.*?)">}m,
54
+ :capture_regexp => %r{<a href="http://github.com/flyerhzm/[^/"]*?(?:/tree)?">(.*?)</a>.*<span id="repository_description".*?>(.*?)</span>.*(<div class="(?:wikistyle|plain)">.*?</div>)</div>}m,
54
55
  :named_captures => ['title', 'description', 'body'],
55
56
  :save_method => Proc.new do |result, page|
56
57
  puts '============================='
@@ -60,10 +61,10 @@ crawler = RegexpCrawler::Crawler.new(
60
61
  puts result[:body][0..100] + "..."
61
62
  end,
62
63
  :need_parse => Proc.new do |page, response_body|
63
- !response_body.index "Fork of"
64
+ page =~ %r{http://github.com/flyerhzm/\w+} && !response_body.index(/Fork of.*?<a href=".*?">/)
64
65
  end)
65
66
  crawler.start
66
- </code></pre>
67
+ </pre></code>
67
68
 
68
69
  The results are as follows:
69
70
  <pre><code>
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.2
1
+ 0.9.0
@@ -1,6 +1,7 @@
1
1
  require 'net/http'
2
2
  require 'uri'
3
3
  require 'iconv'
4
+ require 'logger'
4
5
  require 'regexp_crawler/http'
5
6
  require 'regexp_crawler/crawler'
6
7
 
@@ -12,6 +12,7 @@ module RegexpCrawler
12
12
  @headers = options[:headers]
13
13
  @encoding = options[:encoding]
14
14
  @need_parse = options[:need_parse]
15
+ @logger = options[:logger] == true ? Logger.new(STDOUT) : options[:logger]
15
16
  end
16
17
 
17
18
  def capture_regexp=(regexp)
@@ -32,6 +33,7 @@ module RegexpCrawler
32
33
 
33
34
  private
34
35
  def parse_page(uri)
36
+ @logger.debug "crawling page: #{uri.to_s}" if @logger
35
37
  response = Net::HTTP.get_response_with_headers(uri, @headers)
36
38
  parse_response(response, uri)
37
39
  end
@@ -49,8 +51,10 @@ module RegexpCrawler
49
51
  def parse_response(response, uri)
50
52
  response_body = encoding.nil? ? response.body : Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first
51
53
  if response.is_a? Net::HTTPSuccess
54
+ @logger.debug "crawling success: #{uri.to_s}" if @logger
52
55
  if continue_regexp
53
56
  response_body.scan(continue_regexp).each do |page|
57
+ @logger.debug "continue_page: #{page}" if @logger
54
58
  page = page.compact.first if page.is_a? Array
55
59
  continue_uri = continue_uri(uri, page)
56
60
  @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
@@ -59,6 +63,7 @@ module RegexpCrawler
59
63
  if @need_parse.nil? or @need_parse.call(uri.to_s, response_body)
60
64
  md = @capture_regexp.match(response_body)
61
65
  if md
66
+ @logger.debug "response body captured" if @logger
62
67
  captures = md.captures
63
68
  result = {}
64
69
  captures.each_index do |i|
@@ -73,8 +78,10 @@ module RegexpCrawler
73
78
  end
74
79
  end
75
80
  elsif response.is_a? Net::HTTPRedirection
81
+ @logger.debug "crawling redirect: #{response['location']}" if @logger
76
82
  parse_page(URI.parse(response['location']))
77
83
  else
84
+ @logger.debug "crawling nothing: #{uri.to_s}" if @logger
78
85
  # do nothing
79
86
  end
80
87
  end
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{regexp_crawler}
8
- s.version = "0.8.2"
8
+ s.version = "0.9.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Richard Huang"]
@@ -20,7 +20,6 @@ Gem::Specification.new do |s|
20
20
  "LICENSE",
21
21
  "README.textile",
22
22
  "Rakefile",
23
- "TODO",
24
23
  "VERSION",
25
24
  "example/github_projects.rb",
26
25
  "init.rb",
@@ -5,7 +5,7 @@ describe RegexpCrawler::Crawler do
5
5
  it 'should parse data according to regexp' do
6
6
  success_page('/resources/simple.html', 'http://simple.com/')
7
7
 
8
- crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
8
+ crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post', :logger => true)
9
9
  results = crawl.start
10
10
  results.size.should == 1
11
11
  results.first[:post][:title].should == 'test'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flyerhzm-regexp_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Huang
@@ -26,7 +26,6 @@ files:
26
26
  - LICENSE
27
27
  - README.textile
28
28
  - Rakefile
29
- - TODO
30
29
  - VERSION
31
30
  - example/github_projects.rb
32
31
  - init.rb
data/TODO DELETED
File without changes