recipe_crawler 3.1.0 → 3.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/recipe_crawler +4 -2
- data/lib/recipe_crawler/crawler.rb +34 -33
- data/lib/recipe_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ba3aefb304f9ab25b0939192318bde3184d9ee9
|
4
|
+
data.tar.gz: 7f0f73cb2c3b29498fdc5eb47d62c862970ee813
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4849ca0b5ee1f5d2521d0d77f1308c9c11f56f2247a45d45defc4b12fa6ed9d2cb4a36f8831162221792b9a8d2b48eb57a8bfb9d05e4bcd18953d998f17f50f
|
7
|
+
data.tar.gz: 3124cbe521de9c8e45ea5f11f585600f38c9399d4301de6809e08eb9f9ce4768a0cbde3bfb4cd75e1205559dc3bc7cc4a33be725d871f33328ed394f91ea98d3
|
data/bin/recipe_crawler
CHANGED
@@ -5,13 +5,15 @@ require "optparse"
|
|
5
5
|
|
6
6
|
options = {
|
7
7
|
limit: 1,
|
8
|
-
url: nil
|
8
|
+
url: nil,
|
9
|
+
interval_time: 0
|
9
10
|
}
|
10
11
|
|
11
12
|
# Parse options
|
12
13
|
optparser = OptionParser.new do |opts|
|
13
14
|
opts.on("-n", "--number x", Integer, "Number of recipes to fetch") { |x| options[:number] = x}
|
14
15
|
opts.on("-u", "--url url", String, "URL of a recipe to fetch") { |x| options[:url] = x}
|
16
|
+
opts.on("-i", "--interval_time x", Integer, "Wainting time between requests") { |x| options[:interval_time] = x}
|
15
17
|
opts.parse!(ARGV)
|
16
18
|
end
|
17
19
|
|
@@ -22,7 +24,7 @@ if options[:url]
|
|
22
24
|
r = RecipeCrawler::Crawler.new options[:url]
|
23
25
|
|
24
26
|
# crawl and display recipe fetched
|
25
|
-
r.crawl!(options[:number]) do |recipe|
|
27
|
+
r.crawl!(options[:number], options[:interval_time]) do |recipe|
|
26
28
|
puts "[x] #{recipe.title} fetched!"
|
27
29
|
end
|
28
30
|
|
@@ -11,14 +11,13 @@ module RecipeCrawler
|
|
11
11
|
# 2. it will crawl the urls found to find other urls, again & again
|
12
12
|
# 3. it will scrape the urls found to get data
|
13
13
|
#
|
14
|
-
# @
|
15
|
-
# @
|
16
|
-
# @
|
17
|
-
# @
|
18
|
-
# @
|
19
|
-
# @
|
20
|
-
#
|
21
|
-
# @attr db [SQLite3::Database] Sqlite database where recipe will be saved
|
14
|
+
# @attr_reader url [String] first url parsed
|
15
|
+
# @attr_reader host [Symbol] of url's host
|
16
|
+
# @attr_reader scraped_urls [Array<String>] urls already scraped
|
17
|
+
# @attr_reader crawled_urls [Array<String>] urls already crawled
|
18
|
+
# @attr_reader to_crawl_urls [Array<String>] urls still waiting to be crawled
|
19
|
+
# @attr_reader recipes [Array<RecipeSraper::Recipe>] recipes fetched
|
20
|
+
# @attr_reader db [SQLite3::Database] Sqlite database where recipe will be saved
|
22
21
|
class Crawler
|
23
22
|
|
24
23
|
# URLs that the crawler can parse
|
@@ -29,7 +28,7 @@ module RecipeCrawler
|
|
29
28
|
}
|
30
29
|
|
31
30
|
attr_reader :url, :host, :scraped_urls, :crawled_urls, :to_crawl_urls, :recipes
|
32
|
-
|
31
|
+
attr_accessor :interval_sleep_time
|
33
32
|
|
34
33
|
#
|
35
34
|
# Create a Crawler
|
@@ -42,6 +41,7 @@ module RecipeCrawler
|
|
42
41
|
@scraped_urls = []
|
43
42
|
@to_crawl_urls = []
|
44
43
|
@to_crawl_urls << url
|
44
|
+
@interval_sleep_time = 0
|
45
45
|
@db = SQLite3::Database.new "results.sqlite3"
|
46
46
|
@db.execute "CREATE TABLE IF NOT EXISTS recipes(
|
47
47
|
Id INTEGER PRIMARY KEY,
|
@@ -75,31 +75,27 @@ module RecipeCrawler
|
|
75
75
|
|
76
76
|
#
|
77
77
|
# Start the crawl
|
78
|
-
# @param limit [Integer]
|
78
|
+
# @param limit [Integer] the maximum number of scraped recipes
|
79
|
+
# @param interval_sleep_time [Integer] waiting time between scraping
|
79
80
|
#
|
80
81
|
# @yield [RecipeSraper::Recipe] as recipe scraped
|
81
|
-
def crawl! limit=2
|
82
|
-
|
82
|
+
def crawl! limit=2, interval_sleep_time=0
|
83
|
+
recipes_returned = 0
|
84
|
+
|
83
85
|
if @host == :cuisineaz
|
84
|
-
|
85
|
-
|
86
|
-
|
86
|
+
|
87
|
+
while !@to_crawl_urls.empty? and limit > @recipes.count
|
88
|
+
# find all links on the given url (and the urls of these)
|
89
|
+
get_links @to_crawl_urls[0]
|
90
|
+
# now scrape an url
|
91
|
+
recipe = scrape @to_crawl_urls[0]
|
92
|
+
yield recipe if recipe and block_given?
|
93
|
+
sleep interval_sleep_time
|
87
94
|
end
|
88
95
|
|
89
96
|
else
|
90
97
|
raise NotImplementedError
|
91
98
|
end
|
92
|
-
|
93
|
-
# scrap urls
|
94
|
-
recipes_returned = 0
|
95
|
-
@crawled_urls.each{ |crawled_url|
|
96
|
-
if limit > recipes_returned
|
97
|
-
yield scrape crawled_url
|
98
|
-
recipes_returned += 1
|
99
|
-
else
|
100
|
-
break
|
101
|
-
end
|
102
|
-
} if block_given?
|
103
99
|
end
|
104
100
|
|
105
101
|
|
@@ -108,14 +104,19 @@ module RecipeCrawler
|
|
108
104
|
# @param url [String] as url to scrape
|
109
105
|
#
|
110
106
|
# @return [RecipeSraper::Recipe] as recipe scraped
|
107
|
+
# @return [nil] if recipe cannot be fetched
|
111
108
|
def scrape url
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
109
|
+
begin
|
110
|
+
recipe = RecipeSraper::Recipe.new url
|
111
|
+
@scraped_urls << url
|
112
|
+
@recipes << recipe
|
113
|
+
if save recipe
|
114
|
+
return recipe
|
115
|
+
else
|
116
|
+
raise SQLite3::Exception, 'cannot save recipe'
|
117
|
+
end
|
118
|
+
rescue OpenURI::HTTPError
|
119
|
+
return nil
|
119
120
|
end
|
120
121
|
end
|
121
122
|
|