RubyGems - flyerhzm-regexp_crawler - Versions diffs - 0.8.1 → 0.8.2 - Mend

flyerhzm-regexp_crawler 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/README.textile +5 -5
data/VERSION +1 -1
data/example/github_projects.rb +19 -0
data/lib/regexp_crawler/crawler.rb +2 -2
data/regexp_crawler.gemspec +3 -2
data/spec/regexp_crawler_spec.rb +1 -1
metadata +3 -2

data/README.textile CHANGED Viewed

@@ -1,6 +1,6 @@
 h1. RegexpCrawler
-RegexpCrawler is a crawler which uses regrex expression to catch data from website. It is easy to use and less code if you are familiar with regrex expression.
+RegexpCrawler is a crawler which uses regular expression to catch data from website. It is easy to use and less code if you are familiar with regular expression.
 **************************************************************************
@@ -26,8 +26,8 @@ options is a hash
 * <code>:continue_regexp</code>, optional, a regexp to define what website urls the crawler continue to crawl, it is parsed by String#scan and get the first not nil result
 * <code>:capture_regexp</code>, mandatory, a regexp to define what contents the crawler crawl, it is parse by Regexp#match and get all group captures
 * <code>:named_captures</code>, mandatory, a string array to define the names of captured groups according to :capture_regexp
-* <code>:model</code>, :optional if :save_method defined, a string of result's model class
-* <code>:save_method</code>, :optional if :model defined, a proc to define how to save the result which the crawler crawled, the proc accept two parameters, first is one page crawled result, second is the crawled url
+* <code>:model</code>, optional if :save_method defined, a string of result's model class
+* <code>:save_method</code>, optional if :model defined, a proc to define how to save the result which the crawler crawled, the proc accept two parameters, first is one page crawled result, second is the crawled url
 * <code>:headers</code>, optional, a hash to define http headers
 * <code>:encoding</code>, optional, a string of the coding of crawled page, the results will be converted to utf8
 * <code>:need_parse</code>, optional, a proc if parsing the page by regexp or not, the proc accept two parameters, first is the crawled website uri, second is the response body of crawled page
@@ -76,9 +76,9 @@ A rails plugin/gem to kill N+1 queries and unused eager loading
 =============================
 http://github.com/flyerhzm/regexp_crawler/tree/master
 regexp_crawler
-A crawler which use regrex expression to catch data.
+A crawler which use regular expression to catch data.
 <div class="wikistyle"><h1>RegexpCrawler</h1>
-<p>RegexpCrawler is a crawler which use regrex expressi...
+<p>RegexpCrawler is a crawler which use regex expressi...
 =============================
 http://github.com/flyerhzm/sitemap/tree/master
 sitemap

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.8.1
+0.8.2

data/example/github_projects.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'rubygems'
+require 'regexp_crawler'
+crawler = RegexpCrawler::Crawler.new(
+  :start_page => "http://github.com/flyerhzm",
+  :continue_regexp => %r{<div class="title"><b><a href="(/flyerhzm/.*?)">}m,
+  :capture_regexp => %r{<a href="http://github.com/flyerhzm/[^/"]*?(?:/tree)?">(.*?)</a>.*<span id="repository_description".*?>(.*?)</span>.*(<div class="(?:wikistyle|plain)">.*?</div>)</div>}m,
+  :named_captures => ['title', 'description', 'body'],
+  :save_method => Proc.new do |result, page|
+    puts '============================='
+    puts page
+    puts result[:title]
+    puts result[:description]
+    puts result[:body][0..100] + "..."
+  end,
+  :need_parse => Proc.new do |page, response_body|
+    page =~ %r{http://github.com/flyerhzm/\w+} && !response_body.index(/Fork of.*?<a href=".*?">/)
+  end)
+crawler.start

data/lib/regexp_crawler/crawler.rb CHANGED Viewed

@@ -37,7 +37,7 @@ module RegexpCrawler
       end
       def continue_uri(uri, page)
-        if page.start_with?(uri.scheme)
+        if page =~ /^#{uri.scheme}/
           URI.parse(page)
         elsif page.start_with?('/')
           URI.join(uri.scheme + '://' + uri.host, page)
@@ -56,7 +56,7 @@ module RegexpCrawler
               @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
             end
           end
-          if @need_parse.nil? or @need_parse.call(uri.to_i, response_body)
+          if @need_parse.nil? or @need_parse.call(uri.to_s, response_body)
             md = @capture_regexp.match(response_body)
             if md
               captures = md.captures

data/regexp_crawler.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{regexp_crawler}
-  s.version = "0.8.1"
+  s.version = "0.8.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Richard Huang"]
-  s.date = %q{2009-09-12}
+  s.date = %q{2009-09-13}
   s.description = %q{RegexpCrawler is a Ruby library for crawl data from website using regular expression.}
   s.email = %q{flyerhzm@gmail.com}
   s.extra_rdoc_files = [
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
      "Rakefile",
      "TODO",
      "VERSION",
+     "example/github_projects.rb",
      "init.rb",
      "lib/regexp_crawler.rb",
      "lib/regexp_crawler/crawler.rb",

data/spec/regexp_crawler_spec.rb CHANGED Viewed

@@ -91,7 +91,7 @@ describe RegexpCrawler::Crawler do
       crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
       crawl.named_captures = ['title', 'date', 'body']
       crawl.model = 'post'
-      crawl.need_parse = Proc.new do |uri, response_body|
+      crawl.need_parse = Proc.new do |page, response_body|
         if response_body.index('nested2 test html')
           false
         else

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: flyerhzm-regexp_crawler
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.8.2
 platform: ruby
 authors:
 - Richard Huang
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-09-12 00:00:00 -07:00
+date: 2009-09-13 00:00:00 -07:00
 default_executable:
 dependencies: []
@@ -28,6 +28,7 @@ files:
 - Rakefile
 - TODO
 - VERSION
+- example/github_projects.rb
 - init.rb
 - lib/regexp_crawler.rb
 - lib/regexp_crawler/crawler.rb