RubyGems - recipe_crawler - Versions diffs - 3.1.0 → 3.1.2 - Mend

recipe_crawler 3.1.0 → 3.1.2

Files changed (5) hide show

checksums.yaml +4 -4
data/bin/recipe_crawler +4 -2
data/lib/recipe_crawler/crawler.rb +34 -33
data/lib/recipe_crawler/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 93407c5d01cd73352cf65d4088cd809db0cafe52
-  data.tar.gz: 40e6ed45c7dbddcecc5263d39b0d861f2f6e176b
+  metadata.gz: 3ba3aefb304f9ab25b0939192318bde3184d9ee9
+  data.tar.gz: 7f0f73cb2c3b29498fdc5eb47d62c862970ee813
 SHA512:
-  metadata.gz: 8711f60e8a64f9283938c6691db8b2a31e53739c4f15b7cedf0aeb05ebcbdbff1effccf180df3ab5e4e34f2ecd4507809015afddaa2e3b5d04b194c8752eb35d
-  data.tar.gz: 47668977e73a246b6b5ad5f65bf59ffcaacdccbac906cf2b6f4ecfaad641fb8c3906c8903d1a929224842f795d940b5154664b032c84c35838947ff59da11794
+  metadata.gz: a4849ca0b5ee1f5d2521d0d77f1308c9c11f56f2247a45d45defc4b12fa6ed9d2cb4a36f8831162221792b9a8d2b48eb57a8bfb9d05e4bcd18953d998f17f50f
+  data.tar.gz: 3124cbe521de9c8e45ea5f11f585600f38c9399d4301de6809e08eb9f9ce4768a0cbde3bfb4cd75e1205559dc3bc7cc4a33be725d871f33328ed394f91ea98d3

data/bin/recipe_crawler CHANGED Viewed

@@ -5,13 +5,15 @@ require "optparse"
 options = {
   limit: 1,
-  url: nil
+  url: nil,
+  interval_time: 0
 }
 # Parse options
 optparser = OptionParser.new do |opts|
   opts.on("-n", "--number x", Integer, "Number of recipes to fetch") { |x| options[:number] = x}
   opts.on("-u", "--url url", String, "URL of a recipe to fetch") { |x| options[:url] = x}
+  opts.on("-i", "--interval_time x", Integer, "Wainting time between requests") { |x| options[:interval_time] = x}
   opts.parse!(ARGV)
 end
@@ -22,7 +24,7 @@ if options[:url]
   r = RecipeCrawler::Crawler.new options[:url]
   # crawl and display recipe fetched
-  r.crawl!(options[:number]) do |recipe|
+  r.crawl!(options[:number], options[:interval_time]) do |recipe|
     puts "[x] #{recipe.title} fetched!"
   end

data/lib/recipe_crawler/crawler.rb CHANGED Viewed

@@ -11,14 +11,13 @@ module RecipeCrawler
 	#   2. it will crawl urls found to find other urls again & again
 	#   3. it will scrape urls found to get data
 	#
-	# @attr url [String] first url parsed
-	# @attr host [Symbol] of url's host
-	# @attr scraped_urls [Array<String>] of url's host
-	# @attr crawled_urls [Array<String>] of url's host
-	# @attr to_crawl_urls [Array<String>] of url's host
-	# @attr recipes [Array<RecipeSraper::Recipe>] recipes fetched
-	#
-	# @attr db [SQLite3::Database] Sqlite database where recipe will be saved
+	# @attr_reader url [String] first url parsed
+	# @attr_reader host [Symbol] of url's host
+	# @attr_reader scraped_urls [Array<String>] of url's host
+	# @attr_reader crawled_urls [Array<String>] of url's host
+	# @attr_reader to_crawl_urls [Array<String>] of url's host
+	# @attr_reader recipes [Array<RecipeSraper::Recipe>] recipes fetched
+	# @attr_reader db [SQLite3::Database] Sqlite database where recipe will be saved
 	class Crawler
 		# URLs that the crawler can parse
@@ -29,7 +28,7 @@ module RecipeCrawler
 		}
 		attr_reader :url, :host, :scraped_urls, :crawled_urls, :to_crawl_urls, :recipes
+		attr_accessor :interval_sleep_time
 		#
 		# Create a Crawler
@@ -42,6 +41,7 @@ module RecipeCrawler
 				@scraped_urls = []
 				@to_crawl_urls = []
 				@to_crawl_urls << url
+				@interval_sleep_time = 0
 				@db = SQLite3::Database.new "results.sqlite3"
 				@db.execute "CREATE TABLE IF NOT EXISTS recipes(
 					Id INTEGER PRIMARY KEY,
@@ -75,31 +75,27 @@ module RecipeCrawler
 		#
 		# Start the crawl
-		# @param limit [Integer]
+		# @param limit [Integer] the maximum number of scraped recipes
+		# @param interval_sleep_time [Integer] waiting time between scraping
 		#
 		# @yield [RecipeSraper::Recipe] as recipe scraped
-		def crawl! limit=2
-			# find all link on url given (and urls of theses)
+		def crawl! limit=2, interval_sleep_time=0
+			recipes_returned = 0
 			if @host == :cuisineaz
-				while !@to_crawl_urls.empty?
-					get_links to_crawl_urls[0]
-					break if @crawled_urls.count > limit
+				while !@to_crawl_urls.empty? and limit > @recipes.count
+					# find all links on the given url (and the urls of these)
+					get_links @to_crawl_urls[0]
+					# now scrape an url
+					recipe = scrape @to_crawl_urls[0]
+					yield recipe if recipe and block_given?
+					sleep interval_sleep_time
 				end
 			else
 				raise NotImplementedError
 			end
-			# scrap urls
-			recipes_returned = 0
-			@crawled_urls.each{ |crawled_url|
-				if limit > recipes_returned
-					yield scrape crawled_url
-					recipes_returned += 1
-				else
-					break
-				end
-			} if block_given?
 		end
@@ -108,14 +104,19 @@ module RecipeCrawler
 		# param url [String] as url to scrape
 		#
 		# @return [RecipeSraper::Recipe] as recipe scraped
+		# @return [nil] if recipe cannot be fetched
 		def scrape url
-			recipe = RecipeSraper::Recipe.new url
-			@scraped_urls << url
-			@recipes << recipe
-			if save recipe
-				return recipe
-			else
-				raise SQLite3::Exception, 'accnot save recipe'
+			begin
+				recipe = RecipeSraper::Recipe.new url
+				@scraped_urls << url
+				@recipes << recipe
+				if save recipe
+					return recipe
+				else
+					raise SQLite3::Exception, 'cannot save recipe'
+				end
+			rescue OpenURI::HTTPError
+				return nil
 			end
 		end

data/lib/recipe_crawler/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module RecipeCrawler
-  VERSION = "3.1.0"
+  VERSION = "3.1.2"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: recipe_crawler
 version: !ruby/object:Gem::Version
-  version: 3.1.0
+  version: 3.1.2
 platform: ruby
 authors:
 - madeindjs