flyerhzm-regexp_crawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ tmp/**
data/LICENSE ADDED
File without changes
data/README ADDED
@@ -0,0 +1 @@
1
+ A crawler which uses regular expressions to capture data.
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
+ require 'rake'
2
+ require 'rake/rdoctask'
3
+ require 'spec/rake/spectask'
4
+ require 'jeweler'
5
+
6
+ desc "Run all specs in spec directory"
7
+ Spec::Rake::SpecTask.new(:spec) do |t|
8
+ t.spec_files = FileList['spec/**/*_spec.rb']
9
+ t.rcov = true
10
+ t.rcov_opts = ['--exclude', 'spec,config,Library,usr/lib/ruby']
11
+ t.rcov_dir = File.join(File.dirname(__FILE__), "tmp")
12
+ end
13
+
14
+ Jeweler::Tasks.new do |gemspec|
15
+ gemspec.name = "regexp_crawler"
16
+ gemspec.summary = "RegexpCrawler is a Ruby library for crawl data from website using regular expression."
17
+ gemspec.description = "RegexpCrawler is a Ruby library for crawl data from website using regular expression."
18
+ gemspec.email = "flyerhzm@gmail.com"
19
+ gemspec.homepage = ""
20
+ gemspec.authors = ["Richard Huang"]
21
+ end
data/TODO ADDED
File without changes
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ if defined?(ActiveRecord)
2
+ ActiveRecord::Base.send :include, RegexpCrawler
3
+ end
@@ -0,0 +1,54 @@
1
+ module RegexpCrawler
2
+ class Crawler
3
+ attr_accessor :start_page, :continue_regexp, :named_captures, :model
4
+
5
+ def initialize(options = {})
6
+ @start_page = options[:start_page]
7
+ @continue_regexp = options[:continue_regexp]
8
+ @capture_regexp = options[:capture_regexp]
9
+ @named_captures = options[:named_captures]
10
+ @model = options[:model]
11
+ end
12
+
13
+ def capture_regexp=(regexp)
14
+ @capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
15
+ end
16
+
17
+ def start
18
+ results = []
19
+ @pages = [@start_page]
20
+ while !@pages.empty?
21
+ uri = URI.parse(@pages.shift)
22
+ result = parse_page(uri)
23
+ results << result if result
24
+ end
25
+ results
26
+ end
27
+
28
+ def parse_page(uri)
29
+ response = Net::HTTP.get_response(uri)
30
+ parse_response(response, uri)
31
+ end
32
+
33
+ def parse_response(response, uri)
34
+ if response.is_a? Net::HTTPSuccess
35
+ response.body.scan(continue_regexp).each do |page|
36
+ url = page.start_with?(uri.scheme) ? page : "#{uri.scheme}://#{uri.host}/#{page}"
37
+ @pages << url
38
+ end if continue_regexp
39
+ md = @capture_regexp.match(response.body)
40
+ if md
41
+ model_result = model.new
42
+ captures = md.captures if md
43
+ captures.each_index do |i|
44
+ model_result.send("#{named_captures[i]}=", captures[i])
45
+ end
46
+ {:model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
47
+ end
48
+ elsif response.is_a? Net::HTTPRedirection
49
+ parse_page(URI.parse(response['location']))
50
+ else
51
+ end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,23 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
+ module RegexpCrawler
5
+
6
+ def self.included(base)
7
+ base.extend ClassMethods
8
+ end
9
+
10
+ module ClassMethods
11
+ def regexp_crawler(options)
12
+ @crawlers ||= []
13
+ @crawlers << Crawler.new(options)
14
+ end
15
+
16
+ def start_crawl
17
+ @crawlers.each do |crawler|
18
+ crawler.start
19
+ end
20
+ end
21
+ end
22
+
23
+ end
@@ -0,0 +1,56 @@
1
+ require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
2
+
3
+ describe RegexpCrawler::Crawler do
4
+ class Post
5
+ attr_accessor :title, :date, :body
6
+ end
7
+
8
+ describe '#simple html' do
9
+ it 'should parse data according to regexp' do
10
+ success_page('/resources/simple.html', 'http://simple.com/')
11
+
12
+ crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => Post)
13
+ results = crawl.start
14
+ results.size.should == 1
15
+ end
16
+
17
+ it 'should redirect' do
18
+ redirect_page('http://redirect.com/', 'http://simple.com/')
19
+ success_page('/resources/simple.html', 'http://simple.com/')
20
+ end
21
+ end
22
+
23
+ describe '#complex html' do
24
+ before(:each) do
25
+ success_page('/resources/complex.html', 'http://complex.com/')
26
+ success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
27
+ success_page('/resources/nested2.html', 'http://complex.com/nested2.html')
28
+ end
29
+
30
+ it 'should parse data according to regexp' do
31
+ crawl = RegexpCrawler::Crawler.new
32
+ crawl.start_page = 'http://complex.com/'
33
+ crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
34
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
35
+ crawl.named_captures = ['title', 'date', 'body']
36
+ crawl.model = Post
37
+ results = crawl.start
38
+ results.size.should == 2
39
+ end
40
+ end
41
+
42
+ def success_page(local_path, remote_path)
43
+ path = File.expand_path(File.dirname(__FILE__) + local_path)
44
+ content = File.read(path)
45
+ http = mock(Net::HTTPSuccess)
46
+ http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
47
+ http.stubs(:body).returns(content)
48
+ Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
49
+ end
50
+
51
+ def redirect_page(remote_path, redirect_path)
52
+ http = mock(Net::HTTPRedirection)
53
+ http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
54
+ Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
55
+ end
56
+ end
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <head>
3
+ <title>complex test html</title>
4
+ </head>
5
+ <body>
6
+ <div>
7
+ <a link="nested1.html">nested1</a>
8
+ <a link="http://complex.com/nested2.html">nested2</a>
9
+ </div>
10
+ </body>
11
+ </html>
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head>
3
+ <title>nested1 test html</title>
4
+ </head>
5
+ <body>
6
+ <div>
7
+ <div class="title">nested1</div>
8
+ <div class="date">2008/10/10</div>
9
+ <div class="body"><p class="content">nested1</p></div>
10
+ </div>
11
+ </body>
12
+ </html>
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head>
3
+ <title>nested2 test html</title>
4
+ </head>
5
+ <body>
6
+ <div>
7
+ <div class="title">nested2</div>
8
+ <div class="date">2008/10/10</div>
9
+ <div class="body"><p class="content">nested2</p></div>
10
+ </div>
11
+ </body>
12
+ </html>
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head>
3
+ <title>simple test html</title>
4
+ </head>
5
+ <body>
6
+ <div>
7
+ <div class="title">test</div>
8
+ <div class="date">2008/09/10</div>
9
+ <div class="body"><p class="content">test</p></div>
10
+ </div>
11
+ </body>
12
+ </html>
data/spec/spec.opts ADDED
@@ -0,0 +1,8 @@
1
+ --colour
2
+ --format
3
+ specdoc
4
+ --reverse
5
+ --timeout
6
+ 20
7
+ --loadby
8
+ mtime
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'spec/autorun'
3
+ require 'date'
4
+ require 'mocha'
5
+
6
+ require File.join(File.dirname(__FILE__), '/../lib/regexp_crawler.rb')
7
+ require File.join(File.dirname(__FILE__), '/../lib/regexp_crawler/crawler.rb')
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: flyerhzm-regexp_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Richard Huang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-02 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: RegexpCrawler is a Ruby library for crawl data from website using regular expression.
17
+ email: flyerhzm@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README
25
+ files:
26
+ - .gitignore
27
+ - LICENSE
28
+ - README
29
+ - Rakefile
30
+ - TODO
31
+ - VERSION
32
+ - init.rb
33
+ - lib/regexp_crawler.rb
34
+ - lib/regexp_crawler/crawler.rb
35
+ - spec/regexp_crawler_spec.rb
36
+ - spec/resources/complex.html
37
+ - spec/resources/nested1.html
38
+ - spec/resources/nested2.html
39
+ - spec/resources/simple.html
40
+ - spec/spec.opts
41
+ - spec/spec_helper.rb
42
+ has_rdoc: false
43
+ homepage: ""
44
+ licenses:
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --charset=UTF-8
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements: []
63
+
64
+ rubyforge_project:
65
+ rubygems_version: 1.3.5
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: RegexpCrawler is a Ruby library for crawl data from website using regular expression.
69
+ test_files:
70
+ - spec/spec_helper.rb
71
+ - spec/regexp_crawler_spec.rb