flyerhzm-regexp_crawler 0.2.0 → 0.3.0
This diff compares the contents of two publicly released versions of this package as they appear in their public registry; it is provided for informational purposes only.
- data/README.textile +23 -0
- data/init.rb +0 -3
- data/lib/regexp_crawler/crawler.rb +25 -22
- data/regexp_crawler.gemspec +7 -4
- data/spec/regexp_crawler_spec.rb +5 -6
- metadata +4 -4
- data/README +0 -20
data/README.textile
ADDED
@@ -0,0 +1,23 @@
+h1. RegexpCrawler
+
+RegexpCrawler is a crawler which use regrex expression to catch data.
+
+**************************************************************************
+
+h2. Install
+
+<pre><code>
+gem sources -a http://gems.github.com
+gem install flyerhzm-regexp_crawler
+</code></pre>
+
+**************************************************************************
+
+h2. Usage
+
+<pre><code>
+>> crawler = RegexpCrawler::Crawler.new(:start_page => "http://www.tijee.com/tags/64-google-face-questions/posts", :continue_regexp => %r{"(/posts/\d+-[^#]*?)"}, :capture_regexp => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m, :named_captures => ['title', 'body'], :model => 'post')
+>> crawler.start
+
+=>[{:page=>"http://www.tijee.com/posts/327-google-face-questions-many-companies-will-ask-oh", :post=>{:title=>"Google面试题(很多公司都会问的哦)", :body=>"\n内容摘要:几星期前,一个朋友接受..."}}, {:page=>"http://www.tijee.com/posts/328-java-surface-together-with-the-google-test", :post=>{:title=>"google的一道JAVA面试题", :body=>"\n内容摘要:有一个整数n,写一个函数f(n..."}}]
+</code></pre>
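
In the usage example above, the capture groups in :capture_regexp pair positionally with the names in :named_captures, which is how each crawled page turns into a hash. A self-contained illustration of that mapping (the sample HTML string and variable names below are made up for this sketch; the regexp is the one from the README):

# Illustration only: how :capture_regexp groups line up with :named_captures.
html = "<h2 class='title'><a href='/posts/1'>Hello</a></h2><div class='body'>World</div>"
capture_regexp = %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m
named_captures = ['title', 'body']

md = capture_regexp.match(html)
result = {}
md.captures.each_index { |i| result[named_captures[i].to_sym] = md.captures[i] }
result # => {:title => "Hello", :body => "World"}
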
data/init.rb
CHANGED

data/lib/regexp_crawler/crawler.rb
CHANGED
@@ -27,31 +27,34 @@ module RegexpCrawler
       results
     end

-
-
-
-
+    private
+      def parse_page(uri)
+        response = Net::HTTP.get_response(uri)
+        parse_response(response, uri)
+      end

-
-
-
-
-
-
-
-
-
-
-
-
-
+      def parse_response(response, uri)
+        if response.is_a? Net::HTTPSuccess
+          if continue_regexp
+            response.body.scan(continue_regexp).each do |page|
+              page = page.first if page.is_a? Array
+              continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
+              @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
+            end
+          end
+          md = @capture_regexp.match(response.body)
+          if md
+            captures = md.captures if md
+            result = {}
+            captures.each_index do |i|
+              result[named_captures[i].to_sym] = captures[i]
+            end
+            {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
           end
-
+        elsif response.is_a? Net::HTTPRedirection
+          parse_page(URI.parse(response['location']))
+        else
         end
-      elsif response.is_a? Net::HTTPRedirection
-        parse_page(URI.parse(response['location']))
-      else
       end
     end
-    end
   end
 end
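
The visible behavioural change in this hunk is that parse_response now builds a plain hash keyed by the downcased :model string (plus the :page URL) instead of returning model instances as 0.2.0 did (compare the old README at the bottom of this diff). A minimal sketch of the 0.3.0 call style, reusing the options from the README.textile added in this release:

# Sketch based on the README.textile in this diff; the site and regexps are the
# README's example values, not something introduced here.
require 'rubygems'
require 'regexp_crawler'

crawler = RegexpCrawler::Crawler.new(
  :start_page      => "http://www.tijee.com/tags/64-google-face-questions/posts",
  :continue_regexp => %r{"(/posts/\d+-[^#]*?)"},
  :capture_regexp  => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m,
  :named_captures  => ['title', 'body'],
  :model           => 'post'   # a string in 0.3.0; 0.2.0 expected a class such as Post
)

results = crawler.start
# Each entry looks like {:page => "http://...", :post => {:title => "...", :body => "..."}}
results.each { |result| puts result[:post][:title] }
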
data/regexp_crawler.gemspec
CHANGED
@@ -1,22 +1,25 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE
+# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
 # -*- encoding: utf-8 -*-

 Gem::Specification.new do |s|
   s.name = %q{regexp_crawler}
-  s.version = "0.2.0"
+  s.version = "0.3.0"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Richard Huang"]
-  s.date = %q{2009-08-
+  s.date = %q{2009-08-22}
   s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
   s.email = %q{flyerhzm@gmail.com}
   s.extra_rdoc_files = [
     "LICENSE",
-    "README"
+    "README.textile"
   ]
   s.files = [
     ".gitignore",
     "LICENSE",
-    "README",
+    "README.textile",
     "Rakefile",
     "TODO",
     "VERSION",
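
The new header marks the gemspec as generated by jeweler, so the file itself is not edited by hand. As a rough, hypothetical sketch only (the project's Rakefile is not part of this diff; the values below are copied from the gemspec above and the block layout is assumed), the Jeweler::Tasks block it points to looks something like:

# Hypothetical sketch; the real Rakefile is not shown in this diff.
require 'jeweler'

Jeweler::Tasks.new do |gem|
  gem.name        = "regexp_crawler"
  gem.description = "RegexpCrawler is a Ruby library for crawl data from website using regular expression."
  gem.email       = "flyerhzm@gmail.com"
  gem.authors     = ["Richard Huang"]
end
# `rake gemspec` then regenerates regexp_crawler.gemspec from this block.
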
data/spec/regexp_crawler_spec.rb
CHANGED
@@ -1,17 +1,14 @@
 require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")

 describe RegexpCrawler::Crawler do
-  class Post
-    attr_accessor :title, :date, :body
-  end
-
   describe '#simple html' do
     it 'should parse data according to regexp' do
       success_page('/resources/simple.html', 'http://simple.com/')

-      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model =>
+      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
       results = crawl.start
       results.size.should == 1
+      results.first[:post][:title].should == 'test'
     end

     it 'should redirect' do
@@ -33,9 +30,11 @@ describe RegexpCrawler::Crawler do
       crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
       crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
       crawl.named_captures = ['title', 'date', 'body']
-      crawl.model =
+      crawl.model = 'post'
       results = crawl.start
       results.size.should == 2
+      results.first[:post][:title].should == 'nested1'
+      results.last[:post][:title].should == 'nested2'
     end
   end

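
These specs rely on a success_page helper defined in spec/spec_helper.rb, which is not included in this diff. Purely as a hypothetical sketch of what such a helper might do under RSpec 1.x (the method body, stubbing style, and fixture handling are assumptions, not the gem's actual code), it would stub Net::HTTP so the crawler reads a local fixture instead of the network:

# Hypothetical sketch only; spec/spec_helper.rb is not part of this diff.
# Assumes net/http and uri are already loaded by the gem under test.
def success_page(fixture_path, url)
  response = Net::HTTPSuccess.new('1.1', '200', 'OK')
  response.stub!(:body).and_return(File.read(File.dirname(__FILE__) + fixture_path))
  Net::HTTP.stub!(:get_response).with(URI.parse(url)).and_return(response)
end
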
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: flyerhzm-regexp_crawler
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2009-08-
+date: 2009-08-22 00:00:00 -07:00
 default_executable:
 dependencies: []

@@ -21,11 +21,11 @@ extensions: []

 extra_rdoc_files:
 - LICENSE
-- README
+- README.textile
 files:
 - .gitignore
 - LICENSE
-- README
+- README.textile
 - Rakefile
 - TODO
 - VERSION
data/README
DELETED
@@ -1,20 +0,0 @@
-RegexpCrawler
-============
-
-RegexpCrawler is a crawler which use regrex expression to catch data.
-
-
-Install
-=======
-
-gem sources -a http://gems.github.com
-gem install flyerhzm-regexp_crawler
-
-
-Usage
-=====
-
->> crawler = RegexpCrawler::Crawler.new(:start_page => "http://www.tijee.com/tags/64-google-face-questions/posts", :continue_regexp => %r{"(/posts/\d+-[^#]*?)"}, :capture_regexp => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m, :named_captures => ['title', 'body'], :model => Post)
->> crawler.start
-
-=>[{:page=>"http://www.tijee.com/posts/327-google-face-questions-many-companies-will-ask-oh", :model=>#<Post id: nil, title: "Google面试题(很多公司都会问的哦)", body: "\n内容摘要:几星期前,一个朋友接受...", created_at: nil, updated_at: nil, verify: false>}, {:page=>"http://www.tijee.com/posts/328-java-surface-together-with-the-google-test", :model=>#<Post id: nil, title: "google的一道JAVA面试题", body: "\n内容摘要:有一个整数n,写一个函数f(n...", created_at: nil, updated_at: nil, verify: false>}]