flyerhzm-regexp_crawler 0.2.0 → 0.3.0
This diff compares the contents of two publicly released versions of this package as they appear in their public registry; it is provided for informational purposes only.
- data/README.textile +23 -0
- data/init.rb +0 -3
- data/lib/regexp_crawler/crawler.rb +25 -22
- data/regexp_crawler.gemspec +7 -4
- data/spec/regexp_crawler_spec.rb +5 -6
- metadata +4 -4
- data/README +0 -20
data/README.textile
ADDED
@@ -0,0 +1,23 @@
+h1. RegexpCrawler
+
+RegexpCrawler is a crawler which use regrex expression to catch data.
+
+**************************************************************************
+
+h2. Install
+
+<pre><code>
+gem sources -a http://gems.github.com
+gem install flyerhzm-regexp_crawler
+</code></pre>
+
+**************************************************************************
+
+h2. Usage
+
+<pre><code>
+>> crawler = RegexpCrawler::Crawler.new(:start_page => "http://www.tijee.com/tags/64-google-face-questions/posts", :continue_regexp => %r{"(/posts/\d+-[^#]*?)"}, :capture_regexp => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m, :named_captures => ['title', 'body'], :model => 'post')
+>> crawler.start
+
+=>[{:page=>"http://www.tijee.com/posts/327-google-face-questions-many-companies-will-ask-oh", :post=>{:title=>"Google面试题(很多公司都会问的哦)", :body=>"\n内容摘要:几星期前,一个朋友接受..."}}, {:page=>"http://www.tijee.com/posts/328-java-surface-together-with-the-google-test", :post=>{:title=>"google的一道JAVA面试题", :body=>"\n内容摘要:有一个整数n,写一个函数f(n..."}}]
+</code></pre>
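
In the usage example above, the capture groups in :capture_regexp pair positionally with the names in :named_captures, which is how each crawled page turns into a hash. A self-contained illustration of that mapping (the sample HTML string and variable names below are made up for this sketch; the regexp is the one from the README):

# Illustration only: how :capture_regexp groups line up with :named_captures.
html = "<h2 class='title'><a href='/posts/1'>Hello</a></h2><div class='body'>World</div>"
capture_regexp = %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m
named_captures = ['title', 'body']

md = capture_regexp.match(html)
result = {}
md.captures.each_index { |i| result[named_captures[i].to_sym] = md.captures[i] }
result # => {:title => "Hello", :body => "World"}
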
data/init.rb
CHANGED

data/lib/regexp_crawler/crawler.rb
CHANGED
@@ -27,31 +27,34 @@ module RegexpCrawler
       results
     end

-
-
-
-
+    private
+      def parse_page(uri)
+        response = Net::HTTP.get_response(uri)
+        parse_response(response, uri)
+      end

-
-
-
-
-
-
-
-
-
-
-
-
-
+      def parse_response(response, uri)
+        if response.is_a? Net::HTTPSuccess
+          if continue_regexp
+            response.body.scan(continue_regexp).each do |page|
+              page = page.first if page.is_a? Array
+              continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
+              @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
+            end
+          end
+          md = @capture_regexp.match(response.body)
+          if md
+            captures = md.captures if md
+            result = {}
+            captures.each_index do |i|
+              result[named_captures[i].to_sym] = captures[i]
+            end
+            {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
           end
-
+        elsif response.is_a? Net::HTTPRedirection
+          parse_page(URI.parse(response['location']))
+        else
         end
-      elsif response.is_a? Net::HTTPRedirection
-        parse_page(URI.parse(response['location']))
-      else
       end
     end
-    end
   end
 end
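
The visible behavioural change in this hunk is that parse_response now builds a plain hash keyed by the downcased :model string (plus the :page URL) instead of returning model instances as 0.2.0 did (compare the old README at the bottom of this diff). A minimal sketch of the 0.3.0 call style, reusing the options from the README.textile added in this release:

# Sketch based on the README.textile in this diff; the site and regexps are the
# README's example values, not something introduced here.
require 'rubygems'
require 'regexp_crawler'

crawler = RegexpCrawler::Crawler.new(
  :start_page      => "http://www.tijee.com/tags/64-google-face-questions/posts",
  :continue_regexp => %r{"(/posts/\d+-[^#]*?)"},
  :capture_regexp  => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m,
  :named_captures  => ['title', 'body'],
  :model           => 'post'   # a string in 0.3.0; 0.2.0 expected a class such as Post
)

results = crawler.start
# Each entry looks like {:page => "http://...", :post => {:title => "...", :body => "..."}}
results.each { |result| puts result[:post][:title] }
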
data/regexp_crawler.gemspec
CHANGED
@@ -1,22 +1,25 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE
+# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
 # -*- encoding: utf-8 -*-

 Gem::Specification.new do |s|
   s.name = %q{regexp_crawler}
-  s.version = "0.2.0"
+  s.version = "0.3.0"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Richard Huang"]
-  s.date = %q{2009-08-
+  s.date = %q{2009-08-22}
   s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
   s.email = %q{flyerhzm@gmail.com}
   s.extra_rdoc_files = [
     "LICENSE",
-    "README"
+    "README.textile"
   ]
   s.files = [
     ".gitignore",
     "LICENSE",
-    "README",
+    "README.textile",
     "Rakefile",
     "TODO",
     "VERSION",
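
The new header marks the gemspec as generated by jeweler, so the file itself is not edited by hand. As a rough, hypothetical sketch only (the project's Rakefile is not part of this diff; the values below are copied from the gemspec above and the block layout is assumed), the Jeweler::Tasks block it points to looks something like:

# Hypothetical sketch; the real Rakefile is not shown in this diff.
require 'jeweler'

Jeweler::Tasks.new do |gem|
  gem.name        = "regexp_crawler"
  gem.description = "RegexpCrawler is a Ruby library for crawl data from website using regular expression."
  gem.email       = "flyerhzm@gmail.com"
  gem.authors     = ["Richard Huang"]
end
# `rake gemspec` then regenerates regexp_crawler.gemspec from this block.
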
data/spec/regexp_crawler_spec.rb
CHANGED
@@ -1,17 +1,14 @@
 require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")

 describe RegexpCrawler::Crawler do
-  class Post
-    attr_accessor :title, :date, :body
-  end
-
   describe '#simple html' do
     it 'should parse data according to regexp' do
       success_page('/resources/simple.html', 'http://simple.com/')

-      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model =>
+      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
       results = crawl.start
       results.size.should == 1
+      results.first[:post][:title].should == 'test'
     end

     it 'should redirect' do
@@ -33,9 +30,11 @@ describe RegexpCrawler::Crawler do
       crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
       crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
       crawl.named_captures = ['title', 'date', 'body']
-      crawl.model =
+      crawl.model = 'post'
       results = crawl.start
       results.size.should == 2
+      results.first[:post][:title].should == 'nested1'
+      results.last[:post][:title].should == 'nested2'
     end
   end

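
These specs rely on a success_page helper defined in spec/spec_helper.rb, which is not included in this diff. Purely as a hypothetical sketch of what such a helper might do under RSpec 1.x (the method body, stubbing style, and fixture handling are assumptions, not the gem's actual code), it would stub Net::HTTP so the crawler reads a local fixture instead of the network:

# Hypothetical sketch only; spec/spec_helper.rb is not part of this diff.
# Assumes net/http and uri are already loaded by the gem under test.
def success_page(fixture_path, url)
  response = Net::HTTPSuccess.new('1.1', '200', 'OK')
  response.stub!(:body).and_return(File.read(File.dirname(__FILE__) + fixture_path))
  Net::HTTP.stub!(:get_response).with(URI.parse(url)).and_return(response)
end
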
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: flyerhzm-regexp_crawler
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2009-08-
+date: 2009-08-22 00:00:00 -07:00
 default_executable:
 dependencies: []

@@ -21,11 +21,11 @@ extensions: []

 extra_rdoc_files:
 - LICENSE
-- README
+- README.textile
 files:
 - .gitignore
 - LICENSE
-- README
+- README.textile
 - Rakefile
 - TODO
 - VERSION
data/README
DELETED
@@ -1,20 +0,0 @@
-RegexpCrawler
-============
-
-RegexpCrawler is a crawler which use regrex expression to catch data.
-
-
-Install
-=======
-
-gem sources -a http://gems.github.com
-gem install flyerhzm-regexp_crawler
-
-
-Usage
-=====
-
->> crawler = RegexpCrawler::Crawler.new(:start_page => "http://www.tijee.com/tags/64-google-face-questions/posts", :continue_regexp => %r{"(/posts/\d+-[^#]*?)"}, :capture_regexp => %r{<h2 class='title'><a.*?>(.*?)</a></h2>.*?<div class='body'>(.*?)</div>}m, :named_captures => ['title', 'body'], :model => Post)
->> crawler.start
-
-=>[{:page=>"http://www.tijee.com/posts/327-google-face-questions-many-companies-will-ask-oh", :model=>#<Post id: nil, title: "Google面试题(很多公司都会问的哦)", body: "\n内容摘要:几星期前,一个朋友接受...", created_at: nil, updated_at: nil, verify: false>}, {:page=>"http://www.tijee.com/posts/328-java-surface-together-with-the-google-test", :model=>#<Post id: nil, title: "google的一道JAVA面试题", body: "\n内容摘要:有一个整数n,写一个函数f(n...", created_at: nil, updated_at: nil, verify: false>}]