flyerhzm-regexp_crawler 0.2.0 → 0.3.0

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
data/README.textile ADDED
@@ -0,0 +1,23 @@
+ h1. RegexpCrawler
+
+ RegexpCrawler is a crawler which uses regular expressions to capture data.
+
+ **************************************************************************
+
+ h2. Install
+
+ <pre><code>
+ gem sources -a http://gems.github.com
+ gem install flyerhzm-regexp_crawler
+ </code></pre>
+
+ **************************************************************************
+
+ h2. Usage
+
+ <pre><code>
+ >> crawler = RegexpCrawler::Crawler.new(:start_page => "http://www.tijee.com/tags/64-google-face-questions/posts", :continue_regexp => %r{"(/posts/\d+-[^#]*?)"}, :capture_regexp => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m, :named_captures => ['title', 'body'], :model => 'post')
+ >> crawler.start
+
+ =>[{:page=>"http://www.tijee.com/posts/327-google-face-questions-many-companies-will-ask-oh", :post=>{:title=>"Google面试题(很多公司都会问的哦)", :body=>"\n内容摘要:几星期前,一个朋友接受..."}}, {:page=>"http://www.tijee.com/posts/328-java-surface-together-with-the-google-test", :post=>{:title=>"google的一道JAVA面试题", :body=>"\n内容摘要:有一个整数n,写一个函数f(n..."}}]
+ </code></pre>
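In 0.3.0 the :model option is a plain string and crawler.start returns an array of hashes keyed by that name, rather than model instances. A minimal sketch of consuming the return value, using the crawler built in the README snippet above (keys taken from that example output):

    # Each entry pairs the crawled page URL with a hash of the named captures,
    # keyed by the downcased :model name (:post here).
    results = crawler.start
    results.each do |entry|
      puts entry[:page]            # e.g. "http://www.tijee.com/posts/327-..."
      puts entry[:post][:title]
      puts entry[:post][:body]
    end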
data/init.rb CHANGED
@@ -1,3 +0,0 @@
- if defined?(ActiveRecord)
-   ActiveRecord::Base.send :include, RegexpCrawler
- end
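With init.rb emptied, the Rails plugin hook no longer mixes RegexpCrawler into ActiveRecord::Base, so nothing is persisted automatically; saving the returned hashes is left to the caller. A hedged sketch, assuming a hypothetical ActiveRecord Post model with title and body columns and the crawler from the README above:

    # Post is an illustrative model, not part of the gem.
    crawler.start.each do |entry|
      Post.create(entry[:post])   # entry[:post] => { :title => "...", :body => "..." }
    end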
@@ -27,31 +27,34 @@ module RegexpCrawler
        results
      end

-     def parse_page(uri)
-       response = Net::HTTP.get_response(uri)
-       parse_response(response, uri)
-     end
+     private
+       def parse_page(uri)
+         response = Net::HTTP.get_response(uri)
+         parse_response(response, uri)
+       end

-     def parse_response(response, uri)
-       if response.is_a? Net::HTTPSuccess
-         response.body.scan(continue_regexp).each do |page|
-           page = page.first if page.is_a? Array
-           continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
-           @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
-         end if continue_regexp
-         md = @capture_regexp.match(response.body)
-         if md
-           model_result = model.new
-           captures = md.captures if md
-           captures.each_index do |i|
-             model_result.send("#{named_captures[i]}=", captures[i])
+       def parse_response(response, uri)
+         if response.is_a? Net::HTTPSuccess
+           if continue_regexp
+             response.body.scan(continue_regexp).each do |page|
+               page = page.first if page.is_a? Array
+               continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
+               @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
+             end
+           end
+           md = @capture_regexp.match(response.body)
+           if md
+             captures = md.captures if md
+             result = {}
+             captures.each_index do |i|
+               result[named_captures[i].to_sym] = captures[i]
+             end
+             {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
            end
-           {:model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+         elsif response.is_a? Net::HTTPRedirection
+           parse_page(URI.parse(response['location']))
+         else
          end
-       elsif response.is_a? Net::HTTPRedirection
-         parse_page(URI.parse(response['location']))
-       else
        end
      end
-   end
  end
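The net effect of the hunk above: parse_page and parse_response become private, redirects are still followed, and a successful capture now yields a plain hash keyed by the downcased model name instead of a populated model instance. A standalone sketch of the new capture-to-hash mapping (method name and sample values are illustrative, not the gem's API):

    require 'uri'

    # Mirrors the result-building step of the new parse_response: zip the named
    # captures into a hash and key the whole thing by the downcased model name.
    def build_result(md, named_captures, model, uri)
      result = {}
      md.captures.each_index { |i| result[named_captures[i].to_sym] = md.captures[i] }
      { model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}" }
    end

    md = %r{<h2 class='title'>(.*?)</h2>}.match("<h2 class='title'>hello</h2>")
    build_result(md, ['title'], 'post', URI.parse('http://example.com/posts/1'))
    # => {:post=>{:title=>"hello"}, :page=>"http://example.com/posts/1"}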
@@ -1,22 +1,25 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
  # -*- encoding: utf-8 -*-

  Gem::Specification.new do |s|
    s.name = %q{regexp_crawler}
-   s.version = "0.2.0"
+   s.version = "0.3.0"

    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Richard Huang"]
-   s.date = %q{2009-08-02}
+   s.date = %q{2009-08-22}
    s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
    s.email = %q{flyerhzm@gmail.com}
    s.extra_rdoc_files = [
      "LICENSE",
-     "README"
+     "README.textile"
    ]
    s.files = [
      ".gitignore",
      "LICENSE",
-     "README",
+     "README.textile",
      "Rakefile",
      "TODO",
      "VERSION",
@@ -1,17 +1,14 @@
  require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")

  describe RegexpCrawler::Crawler do
-   class Post
-     attr_accessor :title, :date, :body
-   end
-
    describe '#simple html' do
      it 'should parse data according to regexp' do
        success_page('/resources/simple.html', 'http://simple.com/')

-       crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => Post)
+       crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
        results = crawl.start
        results.size.should == 1
+       results.first[:post][:title].should == 'test'
      end

      it 'should redirect' do
@@ -33,9 +30,11 @@ describe RegexpCrawler::Crawler do
        crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
        crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
        crawl.named_captures = ['title', 'date', 'body']
-       crawl.model = Post
+       crawl.model = 'post'
        results = crawl.start
        results.size.should == 2
+       results.first[:post][:title].should == 'nested1'
+       results.last[:post][:title].should == 'nested2'
      end
    end

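As these specs show, the crawler can be configured either through the constructor options hash (as in the README) or by assigning the accessors after construction; either way the model name is now a plain string. A brief sketch mixing the two styles (the URL and regexps are placeholders):

    crawl = RegexpCrawler::Crawler.new(:start_page => 'http://example.com/')
    crawl.continue_regexp = %r{next\d+\.html}
    crawl.capture_regexp  = %r{<div class="title">(.*?)</div>}m
    crawl.named_captures  = ['title']
    crawl.model           = 'post'
    results = crawl.start   # => [{ :post => { :title => "..." }, :page => "..." }, ...]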
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: flyerhzm-regexp_crawler
  version: !ruby/object:Gem::Version
-   version: 0.2.0
+   version: 0.3.0
  platform: ruby
  authors:
  - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-08-02 00:00:00 -07:00
+ date: 2009-08-22 00:00:00 -07:00
  default_executable:
  dependencies: []

@@ -21,11 +21,11 @@ extensions: []

  extra_rdoc_files:
  - LICENSE
- - README
+ - README.textile
  files:
  - .gitignore
  - LICENSE
- - README
+ - README.textile
  - Rakefile
  - TODO
  - VERSION
data/README DELETED
@@ -1,20 +0,0 @@
- RegexpCrawler
- ============
-
- RegexpCrawler is a crawler which use regrex expression to catch data.
-
-
- Install
- =======
-
- gem sources -a http://gems.github.com
- gem install flyerhzm-regexp_crawler
-
-
- Usage
- =====
-
- >> crawler = RegexpCrawler::Crawler.new(:start_page => "http://www.tijee.com/tags/64-google-face-questions/posts", :continue_regexp => %r{"(/posts/\d+-[^#]*?)"}, :capture_regexp => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m, :named_captures => ['title', 'body'], :model => Post)
- >> crawler.start
-
- =>[{:page=>"http://www.tijee.com/posts/327-google-face-questions-many-companies-will-ask-oh", :model=>#<Post id: nil, title: "Google面试题(很多公司都会问的哦)", body: "\n内容摘要:几星期前,一个朋友接受...", created_at: nil, updated_at: nil, verify: false>}, {:page=>"http://www.tijee.com/posts/328-java-surface-together-with-the-google-test", :model=>#<Post id: nil, title: "google的一道JAVA面试题", body: "\n内容摘要:有一个整数n,写一个函数f(n...", created_at: nil, updated_at: nil, verify: false>}]