RubyGems - flyerhzm-regexp_crawler - Versions diffs - 0.6.0 → 0.7.0 - Mend

flyerhzm-regexp_crawler 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/VERSION +1 -1
data/lib/regexp_crawler/crawler.rb +16 -13
data/lib/regexp_crawler.rb +1 -0
data/regexp_crawler.gemspec +1 -1
data/spec/regexp_crawler_spec.rb +21 -0
data/spec/resources/nested21.html +1 -1
metadata +3 -2

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.6.0
+0.7.0

data/lib/regexp_crawler/crawler.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module RegexpCrawler
   class Crawler
-    attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method, :headers, :encoding
+    attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method, :headers, :encoding, :need_parse
     def initialize(options = {})
       @start_page = options[:start_page]
@@ -11,6 +11,7 @@ module RegexpCrawler
       @save_method = options[:save_method]
       @headers = options[:headers]
       @encoding = options[:encoding]
+      @need_parse = options[:need_parse]
     end
     def capture_regexp=(regexp)
@@ -55,18 +56,20 @@ module RegexpCrawler
               @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
             end
           end
-          md = @capture_regexp.match(response_body)
-          if md
-            captures = md.captures
-            result = {}
-            captures.each_index do |i|
-              result[named_captures[i].to_sym] = captures[i]
-            end
-            if @save_method
-              ret = @save_method.call(result, uri.to_s)
-              @stop = true if ret == false
-            else
-              @results << {@model.downcase.to_sym => result, :page => uri.to_s}
+          if @need_parse.nil? or @need_parse.call(uri, response_body)
+            md = @capture_regexp.match(response_body)
+            if md
+              captures = md.captures
+              result = {}
+              captures.each_index do |i|
+                result[named_captures[i].to_sym] = captures[i]
+              end
+              if @save_method
+                ret = @save_method.call(result, uri.to_s)
+                @stop = true if ret == false
+              else
+                @results << {@model.downcase.to_sym => result, :page => uri.to_s}
+              end
             end
           end
         elsif response.is_a? Net::HTTPRedirection

data/lib/regexp_crawler.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'net/http'
 require 'uri'
+require 'regexp_crawler/http'
 module RegexpCrawler

data/regexp_crawler.gemspec CHANGED Viewed

@@ -5,7 +5,7 @@
 Gem::Specification.new do |s|
   s.name = %q{regexp_crawler}
-  s.version = "0.6.0"
+  s.version = "0.7.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Richard Huang"]

data/spec/regexp_crawler_spec.rb CHANGED Viewed

@@ -82,6 +82,27 @@ describe RegexpCrawler::Crawler do
       results = crawl.start
       parse_pages.size.should == 0
     end
+    it 'should parse skip nested2.html' do
+      success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
+      crawl = RegexpCrawler::Crawler.new
+      crawl.start_page = 'http://complex.com/'
+      crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
+      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+      crawl.named_captures = ['title', 'date', 'body']
+      crawl.model = 'post'
+      crawl.need_parse = Proc.new do |uri, response_body|
+        if response_body.index('nested2 test html')
+          false
+        else
+          true
+        end
+      end
+      results = crawl.start
+      results.size.should == 2
+      results.first[:post][:title].should == 'nested1'
+      results.last[:post][:title].should == 'nested21'
+    end
   end
   def success_page(local_path, remote_path)

data/spec/resources/nested21.html CHANGED Viewed

@@ -1,6 +1,6 @@
 <html>
   <head>
-    <title>nested2 test html</title>
+    <title>nested21 test html</title>
   </head>
   <body>
     <div>

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: flyerhzm-regexp_crawler
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.7.0
 platform: ruby
 authors:
 - Richard Huang
@@ -43,6 +43,7 @@ files:
 - spec/spec_helper.rb
 has_rdoc: false
 homepage: ""
+licenses:
 post_install_message:
 rdoc_options:
 - --charset=UTF-8
@@ -63,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.2.0
+rubygems_version: 1.3.5
 signing_key:
 specification_version: 3
 summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.