recipe_crawler 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 93407c5d01cd73352cf65d4088cd809db0cafe52
-  data.tar.gz: 40e6ed45c7dbddcecc5263d39b0d861f2f6e176b
+  metadata.gz: 3ba3aefb304f9ab25b0939192318bde3184d9ee9
+  data.tar.gz: 7f0f73cb2c3b29498fdc5eb47d62c862970ee813
 SHA512:
-  metadata.gz: 8711f60e8a64f9283938c6691db8b2a31e53739c4f15b7cedf0aeb05ebcbdbff1effccf180df3ab5e4e34f2ecd4507809015afddaa2e3b5d04b194c8752eb35d
-  data.tar.gz: 47668977e73a246b6b5ad5f65bf59ffcaacdccbac906cf2b6f4ecfaad641fb8c3906c8903d1a929224842f795d940b5154664b032c84c35838947ff59da11794
+  metadata.gz: a4849ca0b5ee1f5d2521d0d77f1308c9c11f56f2247a45d45defc4b12fa6ed9d2cb4a36f8831162221792b9a8d2b48eb57a8bfb9d05e4bcd18953d998f17f50f
+  data.tar.gz: 3124cbe521de9c8e45ea5f11f585600f38c9399d4301de6809e08eb9f9ce4768a0cbde3bfb4cd75e1205559dc3bc7cc4a33be725d871f33328ed394f91ea98d3
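
These SHA512 digests cover the metadata.gz and data.tar.gz members inside the .gem archive, not the archive file itself. A minimal verification sketch in Ruby, assuming the package was first fetched locally (e.g. with "gem fetch recipe_crawler -v 3.1.2"; the filename below is that command's default output):

    require "digest"
    require "rubygems/package"

    File.open("recipe_crawler-3.1.2.gem", "rb") do |io|
      # A .gem file is a plain tar archive; walk its members.
      Gem::Package::TarReader.new(io).each do |entry|
        # Only the two members listed in checksums.yaml are of interest here.
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
        puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
      end
    end

The printed hexdigests should match the SHA512 values above.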
data/bin/recipe_crawler CHANGED
@@ -5,13 +5,15 @@ require "optparse"
 
 options = {
 	limit: 1,
-	url: nil
+	url: nil,
+	interval_time: 0
 }
 
 # Parse options
 optparser = OptionParser.new do |opts|
 	opts.on("-n", "--number x", Integer, "Number of recipes to fetch") { |x| options[:number] = x}
 	opts.on("-u", "--url url", String, "URL of a recipe to fetch") { |x| options[:url] = x}
+	opts.on("-i", "--interval_time x", Integer, "Wainting time between requests") { |x| options[:interval_time] = x}
 	opts.parse!(ARGV)
 end
 
@@ -22,7 +24,7 @@ if options[:url]
 	r = RecipeCrawler::Crawler.new options[:url]
 
 	# crawl and display recipe fetched
-	r.crawl!(options[:number]) do |recipe|
+	r.crawl!(options[:number], options[:interval_time]) do |recipe|
 		puts "[x] #{recipe.title} fetched!"
 	end
 
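
The new -i/--interval_time switch is wired through to crawl! as a pause between consecutive HTTP requests. A usage sketch, assuming the gem is installed so that the recipe_crawler executable is on the PATH (the recipe URL is a placeholder):

    recipe_crawler --url "https://www.cuisineaz.com/recettes/..." --number 5 --interval_time 2

This fetches up to five recipes starting from the given page, waiting two seconds between requests.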
@@ -11,14 +11,13 @@ module RecipeCrawler
 	# 2. it will crawl urls founded to find other url again & again
 	# 3. it will scrape urls founded to get data
 	#
-	# @attr url [String] first url parsed
-	# @attr host [Symbol] of url's host
-	# @attr scraped_urls [Array<String>] of url's host
-	# @attr crawled_urls [Array<String>] of url's host
-	# @attr to_crawl_urls [Array<String>] of url's host
-	# @attr recipes [Array<RecipeSraper::Recipe>] recipes fetched
-	#
-	# @attr db [SQLite3::Database] Sqlite database where recipe will be saved
+	# @attr_reader url [String] first url parsed
+	# @attr_reader host [Symbol] of url's host
+	# @attr_reader scraped_urls [Array<String>] of url's host
+	# @attr_reader crawled_urls [Array<String>] of url's host
+	# @attr_reader to_crawl_urls [Array<String>] of url's host
+	# @attr_reader recipes [Array<RecipeSraper::Recipe>] recipes fetched
+	# @attr_reader db [SQLite3::Database] Sqlite database where recipe will be saved
 	class Crawler
 
 		# URL than crawler can parse
@@ -29,7 +28,7 @@ module RecipeCrawler
 		}
 
 		attr_reader :url, :host, :scraped_urls, :crawled_urls, :to_crawl_urls, :recipes
-
+		attr_accessor :interval_sleep_time
 
 		#
 		# Create a Crawler
@@ -42,6 +41,7 @@ module RecipeCrawler
 			@scraped_urls = []
 			@to_crawl_urls = []
 			@to_crawl_urls << url
+			@interval_sleep_time = 0
 			@db = SQLite3::Database.new "results.sqlite3"
 			@db.execute "CREATE TABLE IF NOT EXISTS recipes(
 				Id INTEGER PRIMARY KEY,
@@ -75,31 +75,27 @@ module RecipeCrawler
 
 		#
 		# Start the crawl
-		# @param limit [Integer]
+		# @param limit [Integer] the maximum number of scraped recipes
+		# @param interval_sleep_time [Integer] waiting time between scraping
 		#
 		# @yield [RecipeSraper::Recipe] as recipe scraped
-		def crawl! limit=2
-			# find all link on url given (and urls of theses)
+		def crawl! limit=2, interval_sleep_time=0
+			recipes_returned = 0
+
 			if @host == :cuisineaz
-				while !@to_crawl_urls.empty?
-					get_links to_crawl_urls[0]
-					break if @crawled_urls.count > limit
+
+				while !@to_crawl_urls.empty? and limit > @recipes.count
+					# find all link on url given (and urls of theses)
+					get_links @to_crawl_urls[0]
+					# now scrape an url
+					recipe = scrape @to_crawl_urls[0]
+					yield recipe if recipe and block_given?
+					sleep interval_sleep_time
 				end
 
 			else
 				raise NotImplementedError
 			end
-
-			# scrap urls
-			recipes_returned = 0
-			@crawled_urls.each{ |crawled_url|
-				if limit > recipes_returned
-					yield scrape crawled_url
-					recipes_returned += 1
-				else
-					break
-				end
-			} if block_given?
 		end
 
 
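
The rewritten crawl! scrapes each URL as soon as it is discovered, yields the recipe to the block, stops once @recipes reaches the limit, and sleeps for interval_sleep_time seconds between iterations. A minimal library-level sketch of the new two-argument call, mirroring the executable above (the cuisineaz URL is a placeholder):

    require "recipe_crawler"

    # Only cuisineaz URLs are supported; other hosts raise NotImplementedError.
    crawler = RecipeCrawler::Crawler.new "https://www.cuisineaz.com/recettes/..."

    # Fetch up to 3 recipes, pausing 2 seconds between requests.
    crawler.crawl!(3, 2) do |recipe|
      puts "[x] #{recipe.title} fetched!"
    end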
@@ -108,14 +104,19 @@ module RecipeCrawler
 		# param url [String] as url to scrape
 		#
 		# @return [RecipeSraper::Recipe] as recipe scraped
+		# @return [nil] if recipe connat be fetched
 		def scrape url
-			recipe = RecipeSraper::Recipe.new url
-			@scraped_urls << url
-			@recipes << recipe
-			if save recipe
-				return recipe
-			else
-				raise SQLite3::Exception, 'accnot save recipe'
+			begin
+				recipe = RecipeSraper::Recipe.new url
+				@scraped_urls << url
+				@recipes << recipe
+				if save recipe
+					return recipe
+				else
+					raise SQLite3::Exception, 'cannot save recipe'
+				end
+			rescue OpenURI::HTTPError
+				return nil
 			end
 		end
 
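
Because scrape now rescues OpenURI::HTTPError and returns nil instead of aborting the crawl on an unreachable page, crawl! only yields recipes that were actually fetched, and any direct caller of scrape should check for nil. A short sketch, reusing the crawler instance from the previous example (the dead URL is hypothetical):

    recipe = crawler.scrape "https://www.cuisineaz.com/recettes/..."
    if recipe
      puts "[x] #{recipe.title} fetched!"
    else
      puts "skipped: page could not be fetched"
    end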
@@ -1,3 +1,3 @@
 module RecipeCrawler
-	VERSION = "3.1.0"
+	VERSION = "3.1.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: recipe_crawler
 version: !ruby/object:Gem::Version
-  version: 3.1.0
+  version: 3.1.2
 platform: ruby
 authors:
 - madeindjs