recipe_crawler 3.1.0 → 3.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 93407c5d01cd73352cf65d4088cd809db0cafe52
-  data.tar.gz: 40e6ed45c7dbddcecc5263d39b0d861f2f6e176b
+  metadata.gz: 3ba3aefb304f9ab25b0939192318bde3184d9ee9
+  data.tar.gz: 7f0f73cb2c3b29498fdc5eb47d62c862970ee813
 SHA512:
-  metadata.gz: 8711f60e8a64f9283938c6691db8b2a31e53739c4f15b7cedf0aeb05ebcbdbff1effccf180df3ab5e4e34f2ecd4507809015afddaa2e3b5d04b194c8752eb35d
-  data.tar.gz: 47668977e73a246b6b5ad5f65bf59ffcaacdccbac906cf2b6f4ecfaad641fb8c3906c8903d1a929224842f795d940b5154664b032c84c35838947ff59da11794
+  metadata.gz: a4849ca0b5ee1f5d2521d0d77f1308c9c11f56f2247a45d45defc4b12fa6ed9d2cb4a36f8831162221792b9a8d2b48eb57a8bfb9d05e4bcd18953d998f17f50f
+  data.tar.gz: 3124cbe521de9c8e45ea5f11f585600f38c9399d4301de6809e08eb9f9ce4768a0cbde3bfb4cd75e1205559dc3bc7cc4a33be725d871f33328ed394f91ea98d3
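
For anyone auditing the release by hand, these digests can be recomputed with Ruby's standard library. A minimal sketch, assuming the .gem archive has been extracted first (a .gem file is a tar archive, so tar -xf recipe_crawler-3.1.2.gem leaves metadata.gz and data.tar.gz in the current directory):

    require "digest"

    # recompute the SHA1 and SHA512 entries recorded in checksums.yaml
    %w[metadata.gz data.tar.gz].each do |file|
      puts "SHA1   #{file}: #{Digest::SHA1.file(file).hexdigest}"
      puts "SHA512 #{file}: #{Digest::SHA512.file(file).hexdigest}"
    end
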
data/bin/recipe_crawler CHANGED
@@ -5,13 +5,15 @@ require "optparse"
 
 options = {
   limit: 1,
-  url: nil
+  url: nil,
+  interval_time: 0
 }
 
 # Parse options
 optparser = OptionParser.new do |opts|
   opts.on("-n", "--number x", Integer, "Number of recipes to fetch") { |x| options[:number] = x }
   opts.on("-u", "--url url", String, "URL of a recipe to fetch") { |x| options[:url] = x }
+  opts.on("-i", "--interval_time x", Integer, "Waiting time between requests") { |x| options[:interval_time] = x }
   opts.parse!(ARGV)
 end
 
@@ -22,7 +24,7 @@ if options[:url]
   r = RecipeCrawler::Crawler.new options[:url]
 
   # crawl and display each fetched recipe
-  r.crawl!(options[:number]) do |recipe|
+  r.crawl!(options[:number], options[:interval_time]) do |recipe|
     puts "[x] #{recipe.title} fetched!"
   end
 
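
With the new flag wired into both the option parser and the crawl! call, a hypothetical invocation (the URL is illustrative only) looks like:

    recipe_crawler --url http://www.cuisineaz.com/recettes/some-recipe.aspx --number 5 --interval_time 2

which fetches at most 5 recipes and sleeps 2 seconds between HTTP requests.
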
@@ -11,14 +11,13 @@ module RecipeCrawler
   # 2. it will crawl the urls found to find more urls, again & again
   # 3. it will scrape the urls found to get data
   #
-  # @attr url [String] first url parsed
-  # @attr host [Symbol] of url's host
-  # @attr scraped_urls [Array<String>] of url's host
-  # @attr crawled_urls [Array<String>] of url's host
-  # @attr to_crawl_urls [Array<String>] of url's host
-  # @attr recipes [Array<RecipeSraper::Recipe>] recipes fetched
-  #
-  # @attr db [SQLite3::Database] Sqlite database where recipe will be saved
+  # @attr_reader url [String] first url parsed
+  # @attr_reader host [Symbol] symbol of the url's host
+  # @attr_reader scraped_urls [Array<String>] urls already scraped
+  # @attr_reader crawled_urls [Array<String>] urls already crawled
+  # @attr_reader to_crawl_urls [Array<String>] urls waiting to be crawled
+  # @attr_reader recipes [Array<RecipeSraper::Recipe>] recipes fetched
+  # @attr_reader db [SQLite3::Database] SQLite database where recipes will be saved
   class Crawler
 
     # URLs that the crawler can parse
@@ -29,7 +28,7 @@ module RecipeCrawler
     }
 
     attr_reader :url, :host, :scraped_urls, :crawled_urls, :to_crawl_urls, :recipes
-
+    attr_accessor :interval_sleep_time
 
     #
     # Create a Crawler
@@ -42,6 +41,7 @@ module RecipeCrawler
       @scraped_urls = []
      @to_crawl_urls = []
       @to_crawl_urls << url
+      @interval_sleep_time = 0
       @db = SQLite3::Database.new "results.sqlite3"
       @db.execute "CREATE TABLE IF NOT EXISTS recipes(
         Id INTEGER PRIMARY KEY,
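
Everything the crawler saves lands in results.sqlite3 in the working directory, so fetched recipes survive between runs. A small sketch of inspecting that file afterwards, assuming only the recipes table created above:

    require "sqlite3"

    db = SQLite3::Database.new "results.sqlite3"
    # count the recipes persisted by previous crawls
    puts db.get_first_value("SELECT COUNT(*) FROM recipes")
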
@@ -75,31 +75,27 @@ module RecipeCrawler
 
     #
     # Start the crawl
-    # @param limit [Integer]
+    # @param limit [Integer] the maximum number of scraped recipes
+    # @param interval_sleep_time [Integer] waiting time between scrapes
     #
     # @yield [RecipeSraper::Recipe] as recipe scraped
-    def crawl! limit=2
-      # find all link on url given (and urls of theses)
+    def crawl! limit=2, interval_sleep_time=0
+      recipes_returned = 0
+
       if @host == :cuisineaz
-        while !@to_crawl_urls.empty?
-          get_links to_crawl_urls[0]
-          break if @crawled_urls.count > limit
+
+        while !@to_crawl_urls.empty? and limit > @recipes.count
+          # find all links on the given url (and the urls of these)
+          get_links @to_crawl_urls[0]
+          # now scrape a url
+          recipe = scrape @to_crawl_urls[0]
+          yield recipe if recipe and block_given?
+          sleep interval_sleep_time
         end
 
       else
         raise NotImplementedError
       end
-
-      # scrap urls
-      recipes_returned = 0
-      @crawled_urls.each{ |crawled_url|
-        if limit > recipes_returned
-          yield scrape crawled_url
-          recipes_returned += 1
-        else
-          break
-        end
-      } if block_given?
     end
 
 
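
From calling code, the reworked loop now scrapes as it crawls and honours the pause between requests. A minimal sketch, assuming a crawlable cuisineaz url (the address below is illustrative only):

    require "recipe_crawler"

    crawler = RecipeCrawler::Crawler.new "http://www.cuisineaz.com/recettes/galette-des-rois-1234.aspx"
    # fetch at most 3 recipes, sleeping 2 seconds between requests
    crawler.crawl!(3, 2) do |recipe|
      puts "[x] #{recipe.title} fetched!"
    end
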
@@ -108,14 +104,19 @@ module RecipeCrawler
     # @param url [String] as url to scrape
     #
     # @return [RecipeSraper::Recipe] as recipe scraped
+    # @return [nil] if the recipe cannot be fetched
     def scrape url
-      recipe = RecipeSraper::Recipe.new url
-      @scraped_urls << url
-      @recipes << recipe
-      if save recipe
-        return recipe
-      else
-        raise SQLite3::Exception, 'accnot save recipe'
+      begin
+        recipe = RecipeSraper::Recipe.new url
+        @scraped_urls << url
+        @recipes << recipe
+        if save recipe
+          return recipe
+        else
+          raise SQLite3::Exception, 'cannot save recipe'
+        end
+      rescue OpenURI::HTTPError
+        return nil
       end
     end
 
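
Since scrape now rescues OpenURI::HTTPError and returns nil instead of aborting the whole crawl, dead links are simply skipped; crawl! already guards its yield on a truthy recipe. A sketch of what a caller should expect, assuming scrape stays public as shown and the url below (illustrative only) returns a 404:

    recipe = crawler.scrape "http://www.cuisineaz.com/recettes/removed-recipe.aspx"
    if recipe
      puts recipe.title
    else
      puts "skipped: url could not be fetched"
    end
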
@@ -1,3 +1,3 @@
 module RecipeCrawler
-  VERSION = "3.1.0"
+  VERSION = "3.1.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: recipe_crawler
 version: !ruby/object:Gem::Version
-  version: 3.1.0
+  version: 3.1.2
 platform: ruby
 authors:
 - madeindjs