RubyGems - flyerhzm-regexp_crawler - Versions diffs - 0.3.0 → 0.4.0 - Mend

flyerhzm-regexp_crawler 0.3.0 → 0.4.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

data/Rakefile +1 -0
data/VERSION +1 -1
data/lib/regexp_crawler/crawler.rb +15 -8
data/regexp_crawler.gemspec +3 -4
data/spec/regexp_crawler_spec.rb +34 -0
metadata +3 -5
data/.gitignore +0 -1

data/Rakefile CHANGED Viewed

@@ -18,4 +18,5 @@ Jeweler::Tasks.new do |gemspec|
   gemspec.email = "flyerhzm@gmail.com"
   gemspec.homepage = ""
   gemspec.authors = ["Richard Huang"]
+  gemspec.files.exclude '.gitignore'
 end

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.4.0

data/lib/regexp_crawler/crawler.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module RegexpCrawler
   class Crawler
-    attr_accessor :start_page, :continue_regexp, :named_captures, :model
+    attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method
     def initialize(options = {})
       @start_page = options[:start_page]
@@ -8,6 +8,7 @@ module RegexpCrawler
       @capture_regexp = options[:capture_regexp]
       @named_captures = options[:named_captures]
       @model = options[:model]
+      @save_method = options[:save_method]
     end
     def capture_regexp=(regexp)
@@ -15,16 +16,15 @@ module RegexpCrawler
     end
     def start
-      results = []
+      @results = []
       @captured_pages = []
       @pages = [URI.parse(@start_page)]
-      while !@pages.empty?
+      while !@pages.empty? and !@stop
         uri = @pages.shift
         @captured_pages << uri
-        result = parse_page(uri)
-        results << result if result
+        parse_page(uri)
       end
-      results
+      @results
     end
     private
@@ -49,12 +49,19 @@ module RegexpCrawler
             captures.each_index do |i|
               result[named_captures[i].to_sym] = captures[i]
             end
-            {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+            url = "#{uri.scheme}://#{uri.host}#{uri.path}"
+            if @save_method
+              ret = @save_method.call(result, url)
+              @stop = true if ret == false
+            else
+              @results << {@model.downcase.to_sym => result, :page => url}
+            end
           end
         elsif response.is_a? Net::HTTPRedirection
           parse_page(URI.parse(response['location']))
         else
+          # do nothing
         end
       end
-    end
+  end
 end

data/regexp_crawler.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{regexp_crawler}
-  s.version = "0.3.0"
+  s.version = "0.4.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Richard Huang"]
-  s.date = %q{2009-08-22}
+  s.date = %q{2009-08-29}
   s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
   s.email = %q{flyerhzm@gmail.com}
   s.extra_rdoc_files = [
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
      "README.textile"
   ]
   s.files = [
-    ".gitignore",
-     "LICENSE",
+    "LICENSE",
      "README.textile",
      "Rakefile",
      "TODO",

data/spec/regexp_crawler_spec.rb CHANGED Viewed

@@ -36,6 +36,40 @@ describe RegexpCrawler::Crawler do
       results.first[:post][:title].should == 'nested1'
       results.last[:post][:title].should == 'nested2'
     end
+    it "should save by myself" do
+      crawl = RegexpCrawler::Crawler.new
+      crawl.start_page = 'http://complex.com/'
+      crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
+      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+      crawl.named_captures = ['title', 'date', 'body']
+      crawl.model = 'post'
+      my_results = []
+      crawl.save_method = Proc.new {|result, page| my_results << result}
+      results = crawl.start
+      results.size.should == 0
+      my_results.size.should == 2
+    end
+    it "should stop parse" do
+      crawl = RegexpCrawler::Crawler.new
+      crawl.start_page = 'http://complex.com/'
+      crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
+      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+      crawl.named_captures = ['title', 'date', 'body']
+      crawl.model = 'post'
+      stop_page = "http://complex.com/nested1.html"
+      parse_pages = []
+      crawl.save_method = Proc.new do |result, page|
+        if page == stop_page
+          false
+        else
+          parse_pages << page
+        end
+      end
+      results = crawl.start
+      parse_pages.size.should == 0
+    end
   end
   def success_page(local_path, remote_path)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: flyerhzm-regexp_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-22 00:00:00 -07:00
+date: 2009-08-29 00:00:00 -07:00
 default_executable:
 dependencies: []
@@ -23,7 +23,6 @@ extra_rdoc_files:
 - LICENSE
 - README.textile
 files:
-- .gitignore
 - LICENSE
 - README.textile
 - Rakefile
@@ -42,7 +41,6 @@ files:
 - spec/spec_helper.rb
 has_rdoc: false
 homepage: ""
-licenses:
 post_install_message:
 rdoc_options:
 - --charset=UTF-8
@@ -63,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.5
+rubygems_version: 1.2.0
 signing_key:
 specification_version: 3
 summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.

data/.gitignore DELETED Viewed

	@@ -1 +0,0 @@
1	- tmp/**