flyerhzm-regexp_crawler 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +5 -5
- data/VERSION +1 -1
- data/example/github_projects.rb +19 -0
- data/lib/regexp_crawler/crawler.rb +2 -2
- data/regexp_crawler.gemspec +3 -2
- data/spec/regexp_crawler_spec.rb +1 -1
- metadata +3 -2
data/README.textile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
h1. RegexpCrawler
|
2
2
|
|
3
|
-
RegexpCrawler is a crawler which uses
|
3
|
+
RegexpCrawler is a crawler which uses regular expression to catch data from website. It is easy to use and less code if you are familiar with regular expression.
|
4
4
|
|
5
5
|
**************************************************************************
|
6
6
|
|
@@ -26,8 +26,8 @@ options is a hash
|
|
26
26
|
* <code>:continue_regexp</code>, optional, a regexp to define what website urls the crawler continue to crawl, it is parsed by String#scan and get the first not nil result
|
27
27
|
* <code>:capture_regexp</code>, mandatory, a regexp to define what contents the crawler crawl, it is parse by Regexp#match and get all group captures
|
28
28
|
* <code>:named_captures</code>, mandatory, a string array to define the names of captured groups according to :capture_regexp
|
29
|
-
* <code>:model</code>,
|
30
|
-
* <code>:save_method</code>,
|
29
|
+
* <code>:model</code>, optional if :save_method defined, a string of result's model class
|
30
|
+
* <code>:save_method</code>, optional if :model defined, a proc to define how to save the result which the crawler crawled, the proc accept two parameters, first is one page crawled result, second is the crawled url
|
31
31
|
* <code>:headers</code>, optional, a hash to define http headers
|
32
32
|
* <code>:encoding</code>, optional, a string of the coding of crawled page, the results will be converted to utf8
|
33
33
|
* <code>:need_parse</code>, optional, a proc if parsing the page by regexp or not, the proc accept two parameters, first is the crawled website uri, second is the response body of crawled page
|
@@ -76,9 +76,9 @@ A rails plugin/gem to kill N+1 queries and unused eager loading
|
|
76
76
|
=============================
|
77
77
|
http://github.com/flyerhzm/regexp_crawler/tree/master
|
78
78
|
regexp_crawler
|
79
|
-
A crawler which use
|
79
|
+
A crawler which use regular expression to catch data.
|
80
80
|
<div class="wikistyle"><h1>RegexpCrawler</h1>
|
81
|
-
<p>RegexpCrawler is a crawler which use
|
81
|
+
<p>RegexpCrawler is a crawler which use regex expressi...
|
82
82
|
=============================
|
83
83
|
http://github.com/flyerhzm/sitemap/tree/master
|
84
84
|
sitemap
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.1
|
1
|
+
0.8.2
|
data/example/github_projects.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'regexp_crawler'
|
3
|
+
|
4
|
+
crawler = RegexpCrawler::Crawler.new(
|
5
|
+
:start_page => "http://github.com/flyerhzm",
|
6
|
+
:continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm/.*?)">}m,
|
7
|
+
:capture_regexp => %r{<a href="http://github.com/flyerhzm/[^/"]*?(?:/tree)?">(.*?)</a>.*<span id="repository_description".*?>(.*?)</span>.*(<div class="(?:wikistyle|plain)">.*?</div>)</div>}m,
|
8
|
+
:named_captures => ['title', 'description', 'body'],
|
9
|
+
:save_method => Proc.new do |result, page|
|
10
|
+
puts '============================='
|
11
|
+
puts page
|
12
|
+
puts result[:title]
|
13
|
+
puts result[:description]
|
14
|
+
puts result[:body][0..100] + "..."
|
15
|
+
end,
|
16
|
+
:need_parse => Proc.new do |page, response_body|
|
17
|
+
page =~ %r{http://github.com/flyerhzm/\w+} && !response_body.index(/Fork of.*?<a href=".*?">/)
|
18
|
+
end)
|
19
|
+
crawler.start
|
@@ -37,7 +37,7 @@ module RegexpCrawler
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def continue_uri(uri, page)
|
40
|
-
if page
|
40
|
+
if page =~ /^#{uri.scheme}/
|
41
41
|
URI.parse(page)
|
42
42
|
elsif page.start_with?('/')
|
43
43
|
URI.join(uri.scheme + '://' + uri.host, page)
|
@@ -56,7 +56,7 @@ module RegexpCrawler
|
|
56
56
|
@pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
|
57
57
|
end
|
58
58
|
end
|
59
|
-
if @need_parse.nil? or @need_parse.call(uri.
|
59
|
+
if @need_parse.nil? or @need_parse.call(uri.to_s, response_body)
|
60
60
|
md = @capture_regexp.match(response_body)
|
61
61
|
if md
|
62
62
|
captures = md.captures
|
data/regexp_crawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{regexp_crawler}
|
8
|
-
s.version = "0.8.1"
|
8
|
+
s.version = "0.8.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Richard Huang"]
|
12
|
-
s.date = %q{2009-09-
|
12
|
+
s.date = %q{2009-09-13}
|
13
13
|
s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
|
14
14
|
s.email = %q{flyerhzm@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
|
|
22
22
|
"Rakefile",
|
23
23
|
"TODO",
|
24
24
|
"VERSION",
|
25
|
+
"example/github_projects.rb",
|
25
26
|
"init.rb",
|
26
27
|
"lib/regexp_crawler.rb",
|
27
28
|
"lib/regexp_crawler/crawler.rb",
|
data/spec/regexp_crawler_spec.rb
CHANGED
@@ -91,7 +91,7 @@ describe RegexpCrawler::Crawler do
|
|
91
91
|
crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
|
92
92
|
crawl.named_captures = ['title', 'date', 'body']
|
93
93
|
crawl.model = 'post'
|
94
|
-
crawl.need_parse = Proc.new do |
|
94
|
+
crawl.need_parse = Proc.new do |page, response_body|
|
95
95
|
if response_body.index('nested2 test html')
|
96
96
|
false
|
97
97
|
else
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flyerhzm-regexp_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.1
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Huang
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
12
|
+
date: 2009-09-13 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- Rakefile
|
29
29
|
- TODO
|
30
30
|
- VERSION
|
31
|
+
- example/github_projects.rb
|
31
32
|
- init.rb
|
32
33
|
- lib/regexp_crawler.rb
|
33
34
|
- lib/regexp_crawler/crawler.rb
|