flyerhzm-regexp_crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/LICENSE +0 -0
- data/README +1 -0
- data/Rakefile +21 -0
- data/TODO +0 -0
- data/VERSION +1 -0
- data/init.rb +3 -0
- data/lib/regexp_crawler/crawler.rb +54 -0
- data/lib/regexp_crawler.rb +23 -0
- data/spec/regexp_crawler_spec.rb +56 -0
- data/spec/resources/complex.html +11 -0
- data/spec/resources/nested1.html +12 -0
- data/spec/resources/nested2.html +12 -0
- data/spec/resources/simple.html +12 -0
- data/spec/spec.opts +8 -0
- data/spec/spec_helper.rb +7 -0
- metadata +71 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
+tmp/**
data/LICENSE
ADDED
File without changes
data/README
ADDED
@@ -0,0 +1 @@
+A crawler that uses regular expressions to capture data.
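For orientation before the file-by-file diff, here is a minimal usage sketch based on the Crawler API added in lib/regexp_crawler/crawler.rb below. The Post model, the pattern, and the URL are hypothetical.

require 'regexp_crawler'

# Hypothetical model; its writers must match :named_captures below.
class Post
  attr_accessor :title, :body
end

crawler = RegexpCrawler::Crawler.new(
  :start_page     => 'http://example.com/',               # hypothetical URL
  :capture_regexp => %r{<h1>(.*?)</h1>.*<p>(.*?)</p>}m,   # one group per field
  :named_captures => ['title', 'body'],
  :model          => Post
)
crawler.start  # => [{:model => #<Post ...>, :page => 'http://example.com/'}]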
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
+require 'rake'
+require 'rake/rdoctask'
+require 'spec/rake/spectask'
+require 'jeweler'
+
+desc "Run all specs in spec directory"
+Spec::Rake::SpecTask.new(:spec) do |t|
+  t.spec_files = FileList['spec/**/*_spec.rb']
+  t.rcov = true
+  t.rcov_opts = ['--exclude', 'spec,config,Library,usr/lib/ruby']
+  t.rcov_dir = File.join(File.dirname(__FILE__), "tmp")
+end
+
+Jeweler::Tasks.new do |gemspec|
+  gemspec.name = "regexp_crawler"
+  gemspec.summary = "RegexpCrawler is a Ruby library for crawling data from websites using regular expressions."
+  gemspec.description = "RegexpCrawler is a Ruby library for crawling data from websites using regular expressions."
+  gemspec.email = "flyerhzm@gmail.com"
+  gemspec.homepage = ""
+  gemspec.authors = ["Richard Huang"]
+end
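Given this Rakefile, the suite would be run with rake spec (assuming the 1.x-era rspec, rcov, and jeweler gems are installed); Jeweler::Tasks additionally generates packaging tasks such as rake build and rake install.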
data/TODO
ADDED
File without changes
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.1.0
data/lib/regexp_crawler/crawler.rb
ADDED
@@ -0,0 +1,54 @@
+module RegexpCrawler
+  class Crawler
+    attr_accessor :start_page, :continue_regexp, :named_captures, :model
+
+    def initialize(options = {})
+      @start_page = options[:start_page]
+      @continue_regexp = options[:continue_regexp]
+      @capture_regexp = options[:capture_regexp]
+      @named_captures = options[:named_captures]
+      @model = options[:model]
+    end
+
+    def capture_regexp=(regexp)
+      @capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
+    end
+
+    def start
+      results = []
+      @pages = [@start_page]
+      while !@pages.empty?
+        uri = URI.parse(@pages.shift)
+        result = parse_page(uri)
+        results << result if result
+      end
+      results
+    end
+
+    def parse_page(uri)
+      response = Net::HTTP.get_response(uri)
+      parse_response(response, uri)
+    end
+
+    def parse_response(response, uri)
+      if response.is_a? Net::HTTPSuccess
+        response.body.scan(continue_regexp).each do |page|
+          url = page.start_with?(uri.scheme) ? page : "#{uri.scheme}://#{uri.host}/#{page}"
+          @pages << url
+        end if continue_regexp
+        md = @capture_regexp.match(response.body)
+        if md
+          model_result = model.new
+          captures = md.captures
+          captures.each_index do |i|
+            model_result.send("#{named_captures[i]}=", captures[i])
+          end
+          {:model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+        end
+      elsif response.is_a? Net::HTTPRedirection
+        parse_page(URI.parse(response['location']))
+      else
+      end
+    end
+  end
+end
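Note the capture_regexp= writer above: it rebuilds the assigned pattern with Regexp::MULTILINE ORed into its options, so '.' matches newlines even when the caller's regexp omits the /m flag. A quick plain-Ruby illustration of that behavior:

pattern   = /<div>(.*?)<\/div>/     # no /m flag
multiline = Regexp.new(pattern.source, pattern.options | Regexp::MULTILINE)

pattern.match("<div>a\nb</div>")        # => nil, '.' stops at the newline
multiline.match("<div>a\nb</div>")[1]   # => "a\nb"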
data/lib/regexp_crawler.rb
ADDED
@@ -0,0 +1,23 @@
+require 'net/http'
+require 'uri'
+
+module RegexpCrawler
+
+  def self.included(base)
+    base.extend ClassMethods
+  end
+
+  module ClassMethods
+    def regexp_crawler(options)
+      @crawlers ||= []
+      @crawlers << Crawler.new(options)
+    end
+
+    def start_crawl
+      @crawlers.each do |crawler|
+        crawler.start
+      end
+    end
+  end
+
+end
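The file above turns RegexpCrawler into a mixin: including it extends the host class with regexp_crawler, which registers one Crawler per call, and start_crawl, which runs every registered crawler. A sketch of a client class under that interface; the SiteCrawler name and all option values are hypothetical:

class SiteCrawler
  include RegexpCrawler

  # Each call registers one Crawler; options mirror Crawler#initialize.
  regexp_crawler :start_page     => 'http://example.com/',
                 :capture_regexp => %r{<title>(.*?)</title>}m,
                 :named_captures => ['title'],
                 :model          => Struct.new(:title)
end

SiteCrawler.start_crawl  # runs each registered crawler in turn

Note that start_crawl discards each crawler's results; a caller that needs the parsed models would use Crawler#start directly.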
data/spec/regexp_crawler_spec.rb
ADDED
@@ -0,0 +1,56 @@
+require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
+
+describe RegexpCrawler::Crawler do
+  class Post
+    attr_accessor :title, :date, :body
+  end
+
+  describe '#simple html' do
+    it 'should parse data according to regexp' do
+      success_page('/resources/simple.html', 'http://simple.com/')
+
+      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => Post)
+      results = crawl.start
+      results.size.should == 1
+    end
+
+    it 'should redirect' do
+      redirect_page('http://redirect.com/', 'http://simple.com/')
+      success_page('/resources/simple.html', 'http://simple.com/')
+    end
+  end
+
+  describe '#complex html' do
+    before(:each) do
+      success_page('/resources/complex.html', 'http://complex.com/')
+      success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
+      success_page('/resources/nested2.html', 'http://complex.com/nested2.html')
+    end
+
+    it 'should parse data according to regexp' do
+      crawl = RegexpCrawler::Crawler.new
+      crawl.start_page = 'http://complex.com/'
+      crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
+      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+      crawl.named_captures = ['title', 'date', 'body']
+      crawl.model = Post
+      results = crawl.start
+      results.size.should == 2
+    end
+  end
+
+  def success_page(local_path, remote_path)
+    path = File.expand_path(File.dirname(__FILE__) + local_path)
+    content = File.read(path)
+    http = mock(Net::HTTPSuccess)
+    http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
+    http.stubs(:body).returns(content)
+    Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
+  end
+
+  def redirect_page(remote_path, redirect_path)
+    http = mock(Net::HTTPRedirection)
+    http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
+    Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
+  end
+end
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,71 @@
+--- !ruby/object:Gem::Specification
+name: flyerhzm-regexp_crawler
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Richard Huang
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-08-02 00:00:00 -07:00
+default_executable:
+dependencies: []
+
+description: RegexpCrawler is a Ruby library for crawling data from websites using regular expressions.
+email: flyerhzm@gmail.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- LICENSE
+- README
+files:
+- .gitignore
+- LICENSE
+- README
+- Rakefile
+- TODO
+- VERSION
+- init.rb
+- lib/regexp_crawler.rb
+- lib/regexp_crawler/crawler.rb
+- spec/regexp_crawler_spec.rb
+- spec/resources/complex.html
+- spec/resources/nested1.html
+- spec/resources/nested2.html
+- spec/resources/simple.html
+- spec/spec.opts
+- spec/spec_helper.rb
+has_rdoc: false
+homepage: ""
+licenses:
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: RegexpCrawler is a Ruby library for crawling data from websites using regular expressions.
+test_files:
+- spec/spec_helper.rb
+- spec/regexp_crawler_spec.rb