flyerhzm-regexp_crawler 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,6 +1,6 @@
1
1
  h1. RegexpCrawler
2
2
 
3
- RegexpCrawler is a crawler which uses regrex expression to catch data from website. It is easy to use and less code if you are familiar with regrex expression.
3
+ RegexpCrawler is a crawler which uses regular expression to catch data from website. It is easy to use and less code if you are familiar with regular expression.
4
4
 
5
5
  **************************************************************************
6
6
 
@@ -26,8 +26,8 @@ options is a hash
26
26
  * <code>:continue_regexp</code>, optional, a regexp to define what website urls the crawler continue to crawl, it is parsed by String#scan and get the first not nil result
27
27
  * <code>:capture_regexp</code>, mandatory, a regexp to define what contents the crawler crawl, it is parse by Regexp#match and get all group captures
28
28
  * <code>:named_captures</code>, mandatory, a string array to define the names of captured groups according to :capture_regexp
29
- * <code>:model</code>, :optional if :save_method defined, a string of result's model class
30
- * <code>:save_method</code>, :optional if :model defined, a proc to define how to save the result which the crawler crawled, the proc accept two parameters, first is one page crawled result, second is the crawled url
29
+ * <code>:model</code>, optional if :save_method defined, a string of result's model class
30
+ * <code>:save_method</code>, optional if :model defined, a proc to define how to save the result which the crawler crawled, the proc accept two parameters, first is one page crawled result, second is the crawled url
31
31
  * <code>:headers</code>, optional, a hash to define http headers
32
32
  * <code>:encoding</code>, optional, a string of the coding of crawled page, the results will be converted to utf8
33
33
  * <code>:need_parse</code>, optional, a proc if parsing the page by regexp or not, the proc accept two parameters, first is the crawled website uri, second is the response body of crawled page
@@ -76,9 +76,9 @@ A rails plugin/gem to kill N+1 queries and unused eager loading
76
76
  =============================
77
77
  http://github.com/flyerhzm/regexp_crawler/tree/master
78
78
  regexp_crawler
79
- A crawler which use regrex expression to catch data.
79
+ A crawler which use regular expression to catch data.
80
80
  <div class="wikistyle"><h1>RegexpCrawler</h1>
81
- <p>RegexpCrawler is a crawler which use regrex expressi...
81
+ <p>RegexpCrawler is a crawler which use regex expressi...
82
82
  =============================
83
83
  http://github.com/flyerhzm/sitemap/tree/master
84
84
  sitemap
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.1
1
+ 0.8.2
data/example/github_projects.rb ADDED
@@ -0,0 +1,19 @@
1
+ require 'rubygems'
2
+ require 'regexp_crawler'
3
+
4
+ crawler = RegexpCrawler::Crawler.new(
5
+ :start_page => "http://github.com/flyerhzm",
6
+ :continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm/.*?)">}m,
7
+ :capture_regexp => %r{<a href="http://github.com/flyerhzm/[^/"]*?(?:/tree)?">(.*?)</a>.*<span id="repository_description".*?>(.*?)</span>.*(<div class="(?:wikistyle|plain)">.*?</div>)</div>}m,
8
+ :named_captures => ['title', 'description', 'body'],
9
+ :save_method => Proc.new do |result, page|
10
+ puts '============================='
11
+ puts page
12
+ puts result[:title]
13
+ puts result[:description]
14
+ puts result[:body][0..100] + "..."
15
+ end,
16
+ :need_parse => Proc.new do |page, response_body|
17
+ page =~ %r{http://github.com/flyerhzm/\w+} && !response_body.index(/Fork of.*?<a href=".*?">/)
18
+ end)
19
+ crawler.start
data/lib/regexp_crawler/crawler.rb CHANGED
@@ -37,7 +37,7 @@ module RegexpCrawler
37
37
  end
38
38
 
39
39
  def continue_uri(uri, page)
40
- if page.start_with?(uri.scheme)
40
+ if page =~ /^#{uri.scheme}/
41
41
  URI.parse(page)
42
42
  elsif page.start_with?('/')
43
43
  URI.join(uri.scheme + '://' + uri.host, page)
@@ -56,7 +56,7 @@ module RegexpCrawler
56
56
  @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
57
57
  end
58
58
  end
59
- if @need_parse.nil? or @need_parse.call(uri.to_i, response_body)
59
+ if @need_parse.nil? or @need_parse.call(uri.to_s, response_body)
60
60
  md = @capture_regexp.match(response_body)
61
61
  if md
62
62
  captures = md.captures
data/regexp_crawler.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{regexp_crawler}
8
- s.version = "0.8.1"
8
+ s.version = "0.8.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Richard Huang"]
12
- s.date = %q{2009-09-12}
12
+ s.date = %q{2009-09-13}
13
13
  s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
14
14
  s.email = %q{flyerhzm@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
22
22
  "Rakefile",
23
23
  "TODO",
24
24
  "VERSION",
25
+ "example/github_projects.rb",
25
26
  "init.rb",
26
27
  "lib/regexp_crawler.rb",
27
28
  "lib/regexp_crawler/crawler.rb",
@@ -91,7 +91,7 @@ describe RegexpCrawler::Crawler do
91
91
  crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
92
92
  crawl.named_captures = ['title', 'date', 'body']
93
93
  crawl.model = 'post'
94
- crawl.need_parse = Proc.new do |uri, response_body|
94
+ crawl.need_parse = Proc.new do |page, response_body|
95
95
  if response_body.index('nested2 test html')
96
96
  false
97
97
  else
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flyerhzm-regexp_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-09-12 00:00:00 -07:00
12
+ date: 2009-09-13 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -28,6 +28,7 @@ files:
28
28
  - Rakefile
29
29
  - TODO
30
30
  - VERSION
31
+ - example/github_projects.rb
31
32
  - init.rb
32
33
  - lib/regexp_crawler.rb
33
34
  - lib/regexp_crawler/crawler.rb