flyerhzm-regexp_crawler 0.8.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +6 -5
- data/VERSION +1 -1
- data/lib/regexp_crawler.rb +1 -0
- data/lib/regexp_crawler/crawler.rb +7 -0
- data/regexp_crawler.gemspec +1 -2
- data/spec/regexp_crawler_spec.rb +1 -1
- metadata +1 -2
- data/TODO +0 -0
data/README.textile
CHANGED
@@ -31,6 +31,7 @@ options is a hash
|
|
31
31
|
* <code>:headers</code>, optional, a hash to define http headers
|
32
32
|
* <code>:encoding</code>, optional, a string naming the encoding of the crawled page; the results will be converted to UTF-8
|
33
33
|
* <code>:need_parse</code>, optional, a proc that decides whether the page should be parsed by regexp; the proc accepts two parameters: the first is the crawled website uri, the second is the response body of the crawled page
|
34
|
+
* <code>:logger</code>, optional, true for logging to STDOUT, or a Logger object for logging to that logger
|
34
35
|
|
35
36
|
If the crawler define :model no :save_method, the RegexpCrawler::Crawler#start will return an array of results, such as
|
36
37
|
<pre><code>
|
@@ -41,7 +42,7 @@ If the crawler define :model no :save_method, the RegexpCrawler::Crawler#start w
|
|
41
42
|
|
42
43
|
h2. Example
|
43
44
|
|
44
|
-
a script to synchronize your github projects except fork projects
|
45
|
+
a script to synchronize your github projects except fork projects, please check <code>example/github_projects.rb</code>
|
45
46
|
|
46
47
|
<pre><code>
|
47
48
|
require 'rubygems'
|
@@ -49,8 +50,8 @@ require 'regexp_crawler'
|
|
49
50
|
|
50
51
|
crawler = RegexpCrawler::Crawler.new(
|
51
52
|
:start_page => "http://github.com/flyerhzm",
|
52
|
-
:continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm
|
53
|
-
:capture_regexp => %r{<a href="http://github.com/flyerhzm
|
53
|
+
:continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm/.*?)">}m,
|
54
|
+
:capture_regexp => %r{<a href="http://github.com/flyerhzm/[^/"]*?(?:/tree)?">(.*?)</a>.*<span id="repository_description".*?>(.*?)</span>.*(<div class="(?:wikistyle|plain)">.*?</div>)</div>}m,
|
54
55
|
:named_captures => ['title', 'description', 'body'],
|
55
56
|
:save_method => Proc.new do |result, page|
|
56
57
|
puts '============================='
|
@@ -60,10 +61,10 @@ crawler = RegexpCrawler::Crawler.new(
|
|
60
61
|
puts result[:body][0..100] + "..."
|
61
62
|
end,
|
62
63
|
:need_parse => Proc.new do |page, response_body|
|
63
|
-
!response_body.index
|
64
|
+
page =~ %r{http://github.com/flyerhzm/\w+} && !response_body.index(/Fork of.*?<a href=".*?">/)
|
64
65
|
end)
|
65
66
|
crawler.start
|
66
|
-
</code
|
67
|
+
</code></pre>
|
67
68
|
|
68
69
|
The results are as follows:
|
69
70
|
<pre><code>
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.2
|
1
|
+
0.9.0
|
data/lib/regexp_crawler.rb
CHANGED
@@ -12,6 +12,7 @@ module RegexpCrawler
|
|
12
12
|
@headers = options[:headers]
|
13
13
|
@encoding = options[:encoding]
|
14
14
|
@need_parse = options[:need_parse]
|
15
|
+
@logger = options[:logger] == true ? Logger.new(STDOUT) : options[:logger]
|
15
16
|
end
|
16
17
|
|
17
18
|
def capture_regexp=(regexp)
|
@@ -32,6 +33,7 @@ module RegexpCrawler
|
|
32
33
|
|
33
34
|
private
|
34
35
|
def parse_page(uri)
|
36
|
+
@logger.debug "crawling page: #{uri.to_s}" if @logger
|
35
37
|
response = Net::HTTP.get_response_with_headers(uri, @headers)
|
36
38
|
parse_response(response, uri)
|
37
39
|
end
|
@@ -49,8 +51,10 @@ module RegexpCrawler
|
|
49
51
|
def parse_response(response, uri)
|
50
52
|
response_body = encoding.nil? ? response.body : Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first
|
51
53
|
if response.is_a? Net::HTTPSuccess
|
54
|
+
@logger.debug "crawling success: #{uri.to_s}" if @logger
|
52
55
|
if continue_regexp
|
53
56
|
response_body.scan(continue_regexp).each do |page|
|
57
|
+
@logger.debug "continue_page: #{page}" if @logger
|
54
58
|
page = page.compact.first if page.is_a? Array
|
55
59
|
continue_uri = continue_uri(uri, page)
|
56
60
|
@pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
|
@@ -59,6 +63,7 @@ module RegexpCrawler
|
|
59
63
|
if @need_parse.nil? or @need_parse.call(uri.to_s, response_body)
|
60
64
|
md = @capture_regexp.match(response_body)
|
61
65
|
if md
|
66
|
+
@logger.debug "response body captured" if @logger
|
62
67
|
captures = md.captures
|
63
68
|
result = {}
|
64
69
|
captures.each_index do |i|
|
@@ -73,8 +78,10 @@ module RegexpCrawler
|
|
73
78
|
end
|
74
79
|
end
|
75
80
|
elsif response.is_a? Net::HTTPRedirection
|
81
|
+
@logger.debug "crawling redirect: #{response['location']}" if @logger
|
76
82
|
parse_page(URI.parse(response['location']))
|
77
83
|
else
|
84
|
+
@logger.debug "crawling nothing: #{uri.to_s}" if @logger
|
78
85
|
# do nothing
|
79
86
|
end
|
80
87
|
end
|
data/regexp_crawler.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{regexp_crawler}
|
8
|
-
s.version = "0.8.2"
|
8
|
+
s.version = "0.9.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Richard Huang"]
|
@@ -20,7 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
"LICENSE",
|
21
21
|
"README.textile",
|
22
22
|
"Rakefile",
|
23
|
-
"TODO",
|
24
23
|
"VERSION",
|
25
24
|
"example/github_projects.rb",
|
26
25
|
"init.rb",
|
data/spec/regexp_crawler_spec.rb
CHANGED
@@ -5,7 +5,7 @@ describe RegexpCrawler::Crawler do
|
|
5
5
|
it 'should parse data according to regexp' do
|
6
6
|
success_page('/resources/simple.html', 'http://simple.com/')
|
7
7
|
|
8
|
-
crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
|
8
|
+
crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post', :logger => true)
|
9
9
|
results = crawl.start
|
10
10
|
results.size.should == 1
|
11
11
|
results.first[:post][:title].should == 'test'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flyerhzm-regexp_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.2
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Huang
|
@@ -26,7 +26,6 @@ files:
|
|
26
26
|
- LICENSE
|
27
27
|
- README.textile
|
28
28
|
- Rakefile
|
29
|
-
- TODO
|
30
29
|
- VERSION
|
31
30
|
- example/github_projects.rb
|
32
31
|
- init.rb
|
data/TODO
DELETED
File without changes
|