flyerhzm-regexp_crawler 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/regexp_crawler/crawler.rb +14 -6
- data/lib/regexp_crawler/http.rb +9 -0
- data/regexp_crawler.gemspec +4 -2
- data/spec/regexp_crawler_spec.rb +21 -7
- data/spec/resources/complex.html +2 -2
- data/spec/resources/nested2.html +1 -0
- data/spec/resources/nested21.html +12 -0
- data/spec/spec_helper.rb +1 -0
- metadata +5 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.0
|
1
|
+
0.6.0
|
@@ -31,25 +31,33 @@ module RegexpCrawler
|
|
31
31
|
|
32
32
|
private
|
33
33
|
def parse_page(uri)
|
34
|
-
response = Net::HTTP.start(uri.host, uri.port) do |http|
|
35
|
-
http.get(uri.request_uri, headers)
|
36
|
-
end
|
34
|
+
response = Net::HTTP.get_response_with_headers(uri, @headers)
|
37
35
|
parse_response(response, uri)
|
38
36
|
end
|
39
37
|
|
38
|
+
def continue_uri(uri, page)
|
39
|
+
if page.start_with?(uri.scheme)
|
40
|
+
URI.parse(page)
|
41
|
+
elsif page.start_with?('/')
|
42
|
+
URI.join(uri.scheme + '://' + uri.host, page)
|
43
|
+
else
|
44
|
+
URI.parse(uri.to_s.split('/')[0..-2].join('/') + '/' + page)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
40
48
|
def parse_response(response, uri)
|
41
|
-
response_body = Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first
|
49
|
+
response_body = encoding.nil? ? response.body : Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first
|
42
50
|
if response.is_a? Net::HTTPSuccess
|
43
51
|
if continue_regexp
|
44
52
|
response_body.scan(continue_regexp).each do |page|
|
45
53
|
page = page.first if page.is_a? Array
|
46
|
-
continue_uri =
|
54
|
+
continue_uri = continue_uri(uri, page)
|
47
55
|
@pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
|
48
56
|
end
|
49
57
|
end
|
50
58
|
md = @capture_regexp.match(response_body)
|
51
59
|
if md
|
52
|
-
captures = md.captures
|
60
|
+
captures = md.captures
|
53
61
|
result = {}
|
54
62
|
captures.each_index do |i|
|
55
63
|
result[named_captures[i].to_sym] = captures[i]
|
data/regexp_crawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{regexp_crawler}
|
8
|
-
s.version = "0.5.0"
|
8
|
+
s.version = "0.6.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Richard Huang"]
|
12
|
-
s.date = %q{2009-08-
|
12
|
+
s.date = %q{2009-08-30}
|
13
13
|
s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
|
14
14
|
s.email = %q{flyerhzm@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -25,11 +25,13 @@ Gem::Specification.new do |s|
|
|
25
25
|
"init.rb",
|
26
26
|
"lib/regexp_crawler.rb",
|
27
27
|
"lib/regexp_crawler/crawler.rb",
|
28
|
+
"lib/regexp_crawler/http.rb",
|
28
29
|
"regexp_crawler.gemspec",
|
29
30
|
"spec/regexp_crawler_spec.rb",
|
30
31
|
"spec/resources/complex.html",
|
31
32
|
"spec/resources/nested1.html",
|
32
33
|
"spec/resources/nested2.html",
|
34
|
+
"spec/resources/nested21.html",
|
33
35
|
"spec/resources/simple.html",
|
34
36
|
"spec/spec.opts",
|
35
37
|
"spec/spec_helper.rb"
|
data/spec/regexp_crawler_spec.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
|
2
2
|
|
3
3
|
describe RegexpCrawler::Crawler do
|
4
|
-
|
4
|
+
context '#simple html' do
|
5
5
|
it 'should parse data according to regexp' do
|
6
6
|
success_page('/resources/simple.html', 'http://simple.com/')
|
7
7
|
|
@@ -17,7 +17,7 @@ describe RegexpCrawler::Crawler do
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
|
20
|
+
context '#complex html' do
|
21
21
|
before(:each) do
|
22
22
|
success_page('/resources/complex.html', 'http://complex.com/')
|
23
23
|
success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
|
@@ -27,7 +27,7 @@ describe RegexpCrawler::Crawler do
|
|
27
27
|
it 'should parse data according to regexp' do
|
28
28
|
crawl = RegexpCrawler::Crawler.new
|
29
29
|
crawl.start_page = 'http://complex.com/'
|
30
|
-
crawl.continue_regexp = %r{(?:http://complex.com
|
30
|
+
crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
|
31
31
|
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
32
32
|
crawl.named_captures = ['title', 'date', 'body']
|
33
33
|
crawl.model = 'post'
|
@@ -37,10 +37,24 @@ describe RegexpCrawler::Crawler do
|
|
37
37
|
results.last[:post][:title].should == 'nested2'
|
38
38
|
end
|
39
39
|
|
40
|
+
it 'should parse nested of nested data' do
|
41
|
+
success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
|
42
|
+
crawl = RegexpCrawler::Crawler.new
|
43
|
+
crawl.start_page = 'http://complex.com/'
|
44
|
+
crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
|
45
|
+
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
46
|
+
crawl.named_captures = ['title', 'date', 'body']
|
47
|
+
crawl.model = 'post'
|
48
|
+
results = crawl.start
|
49
|
+
results.size.should == 3
|
50
|
+
results.first[:post][:title].should == 'nested1'
|
51
|
+
results.last[:post][:title].should == 'nested21'
|
52
|
+
end
|
53
|
+
|
40
54
|
it "should save by myself" do
|
41
55
|
crawl = RegexpCrawler::Crawler.new
|
42
56
|
crawl.start_page = 'http://complex.com/'
|
43
|
-
crawl.continue_regexp = %r{(?:http://complex.com
|
57
|
+
crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
|
44
58
|
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
45
59
|
crawl.named_captures = ['title', 'date', 'body']
|
46
60
|
my_results = []
|
@@ -53,7 +67,7 @@ describe RegexpCrawler::Crawler do
|
|
53
67
|
it "should stop parse" do
|
54
68
|
crawl = RegexpCrawler::Crawler.new
|
55
69
|
crawl.start_page = 'http://complex.com/'
|
56
|
-
crawl.continue_regexp = %r{(?:http://complex.com
|
70
|
+
crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
|
57
71
|
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
58
72
|
crawl.named_captures = ['title', 'date', 'body']
|
59
73
|
stop_page = "http://complex.com/nested1.html"
|
@@ -76,12 +90,12 @@ describe RegexpCrawler::Crawler do
|
|
76
90
|
http = mock(Net::HTTPSuccess)
|
77
91
|
http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
|
78
92
|
http.stubs(:body).returns(content)
|
79
|
-
Net::HTTP.expects(:
|
93
|
+
Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(http)
|
80
94
|
end
|
81
95
|
|
82
96
|
def redirect_page(remote_path, redirect_path)
|
83
97
|
http = mock(Net::HTTPRedirection)
|
84
98
|
http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
|
85
|
-
Net::HTTP.expects(:
|
99
|
+
Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(http)
|
86
100
|
end
|
87
101
|
end
|
data/spec/resources/complex.html
CHANGED
data/spec/resources/nested2.html
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flyerhzm-regexp_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Huang
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-30 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -31,17 +31,18 @@ files:
|
|
31
31
|
- init.rb
|
32
32
|
- lib/regexp_crawler.rb
|
33
33
|
- lib/regexp_crawler/crawler.rb
|
34
|
+
- lib/regexp_crawler/http.rb
|
34
35
|
- regexp_crawler.gemspec
|
35
36
|
- spec/regexp_crawler_spec.rb
|
36
37
|
- spec/resources/complex.html
|
37
38
|
- spec/resources/nested1.html
|
38
39
|
- spec/resources/nested2.html
|
40
|
+
- spec/resources/nested21.html
|
39
41
|
- spec/resources/simple.html
|
40
42
|
- spec/spec.opts
|
41
43
|
- spec/spec_helper.rb
|
42
44
|
has_rdoc: false
|
43
45
|
homepage: ""
|
44
|
-
licenses:
|
45
46
|
post_install_message:
|
46
47
|
rdoc_options:
|
47
48
|
- --charset=UTF-8
|
@@ -62,7 +63,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
62
63
|
requirements: []
|
63
64
|
|
64
65
|
rubyforge_project:
|
65
|
-
rubygems_version: 1.
|
66
|
+
rubygems_version: 1.2.0
|
66
67
|
signing_key:
|
67
68
|
specification_version: 3
|
68
69
|
summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.
|