recipe_crawler 3.1.0 → 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/recipe_crawler +4 -2
- data/lib/recipe_crawler/crawler.rb +34 -33
- data/lib/recipe_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3ba3aefb304f9ab25b0939192318bde3184d9ee9
+  data.tar.gz: 7f0f73cb2c3b29498fdc5eb47d62c862970ee813
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a4849ca0b5ee1f5d2521d0d77f1308c9c11f56f2247a45d45defc4b12fa6ed9d2cb4a36f8831162221792b9a8d2b48eb57a8bfb9d05e4bcd18953d998f17f50f
+  data.tar.gz: 3124cbe521de9c8e45ea5f11f585600f38c9399d4301de6809e08eb9f9ce4768a0cbde3bfb4cd75e1205559dc3bc7cc4a33be725d871f33328ed394f91ea98d3
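To check a downloaded copy of the gem against the new SHA512 values, a minimal sketch using Ruby's standard Digest library (the file names assume metadata.gz and data.tar.gz have been unpacked from the .gem archive into the current directory):

require "digest"

# Hypothetical local copies of the files listed in checksums.yaml
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
end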
data/bin/recipe_crawler
CHANGED

@@ -5,13 +5,15 @@ require "optparse"
 
 options = {
   limit: 1,
-  url: nil
+  url: nil,
+  interval_time: 0
 }
 
 # Parse options
 optparser = OptionParser.new do |opts|
   opts.on("-n", "--number x", Integer, "Number of recipes to fetch") { |x| options[:number] = x}
   opts.on("-u", "--url url", String, "URL of a recipe to fetch") { |x| options[:url] = x}
+  opts.on("-i", "--interval_time x", Integer, "Waiting time between requests") { |x| options[:interval_time] = x}
   opts.parse!(ARGV)
 end
 
@@ -22,7 +24,7 @@ if options[:url]
   r = RecipeCrawler::Crawler.new options[:url]
 
   # crawl and display recipe fetched
-  r.crawl!(options[:number]) do |recipe|
+  r.crawl!(options[:number], options[:interval_time]) do |recipe|
     puts "[x] #{recipe.title} fetched!"
   end
 
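With the option wired through to crawl!, the CLI takes the waiting time next to the existing flags; a hypothetical invocation (the URL and values are placeholders):

recipe_crawler -u http://www.cuisineaz.com/recettes/some-recipe -n 5 -i 2

This would fetch up to five recipes, sleeping two seconds between requests.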
data/lib/recipe_crawler/crawler.rb
CHANGED

@@ -11,14 +11,13 @@ module RecipeCrawler
   # 2. it will crawl urls found to find other urls again & again
   # 3. it will scrape urls found to get data
   #
-  # @
-  # @
-  # @
-  # @
-  # @
-  # @
-  #
-  # @attr db [SQLite3::Database] Sqlite database where recipe will be saved
+  # @attr_reader url [String] first url parsed
+  # @attr_reader host [Symbol] url's host
+  # @attr_reader scraped_urls [Array<String>] urls already scraped
+  # @attr_reader crawled_urls [Array<String>] urls already crawled
+  # @attr_reader to_crawl_urls [Array<String>] urls waiting to be crawled
+  # @attr_reader recipes [Array<RecipeSraper::Recipe>] recipes fetched
+  # @attr_reader db [SQLite3::Database] Sqlite database where recipes will be saved
   class Crawler
 
     # URLs that the crawler can parse
@@ -29,7 +28,7 @@ module RecipeCrawler
     }
 
     attr_reader :url, :host, :scraped_urls, :crawled_urls, :to_crawl_urls, :recipes
-
+    attr_accessor :interval_sleep_time
 
     #
     # Create a Crawler
@@ -42,6 +41,7 @@ module RecipeCrawler
       @scraped_urls = []
       @to_crawl_urls = []
       @to_crawl_urls << url
+      @interval_sleep_time = 0
       @db = SQLite3::Database.new "results.sqlite3"
       @db.execute "CREATE TABLE IF NOT EXISTS recipes(
         Id INTEGER PRIMARY KEY,
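A minimal construction sketch, assuming the gem is installed (the URL is a placeholder for a real cuisineaz recipe page):

require "recipe_crawler"

crawler = RecipeCrawler::Crawler.new "http://www.cuisineaz.com/recettes/..."

# The delay defaults to 0 and is exposed via attr_accessor; note that
# crawl! (below) also accepts the delay as an explicit argument.
crawler.interval_sleep_time = 2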
@@ -75,31 +75,27 @@ module RecipeCrawler
 
     #
     # Start the crawl
-    # @param limit [Integer]
+    # @param limit [Integer] the maximum number of scraped recipes
+    # @param interval_sleep_time [Integer] waiting time between scraping
     #
     # @yield [RecipeSraper::Recipe] as recipe scraped
-    def crawl! limit=2
-
+    def crawl! limit=2, interval_sleep_time=0
+      recipes_returned = 0
+
       if @host == :cuisineaz
-
-
-
+
+        while !@to_crawl_urls.empty? and limit > @recipes.count
+          # find all links on the given url (and the urls of these)
+          get_links @to_crawl_urls[0]
+          # now scrape a url
+          recipe = scrape @to_crawl_urls[0]
+          yield recipe if recipe and block_given?
+          sleep interval_sleep_time
         end
 
       else
         raise NotImplementedError
       end
-
-      # scrap urls
-      recipes_returned = 0
-      @crawled_urls.each{ |crawled_url|
-        if limit > recipes_returned
-          yield scrape crawled_url
-          recipes_returned += 1
-        else
-          break
-        end
-      } if block_given?
     end
 
 
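After this change, crawling and scraping happen in one pass and the loop stops once the limit is reached; a hedged usage sketch continuing the example above (still a placeholder URL, and assuming a cuisineaz host so NotImplementedError is not raised):

# Fetch up to 3 recipes, sleeping 1 second between requests.
crawler.crawl!(3, 1) do |recipe|
  puts recipe.title
end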
@@ -108,14 +104,19 @@ module RecipeCrawler
     # @param url [String] as url to scrape
     #
     # @return [RecipeSraper::Recipe] as recipe scraped
+    # @return [nil] if recipe cannot be fetched
     def scrape url
-
-
-
-
-
-
-
+      begin
+        recipe = RecipeSraper::Recipe.new url
+        @scraped_urls << url
+        @recipes << recipe
+        if save recipe
+          return recipe
+        else
+          raise SQLite3::Exception, 'cannot save recipe'
+        end
+      rescue OpenURI::HTTPError
+        return nil
       end
     end
 
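The rescue gives scrape a soft-failure contract: an OpenURI::HTTPError produces nil instead of aborting the crawl, which is exactly what the `yield recipe if recipe` guard in crawl! relies on. The same pattern in isolation, as a minimal sketch (fetch_or_nil is illustrative, not part of the gem):

require "open-uri"

# Hypothetical helper mirroring scrape's error handling.
def fetch_or_nil(url)
  URI.open(url).read
rescue OpenURI::HTTPError
  nil # caller can skip this url instead of crashing
end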