recipe_crawler 3.1.0 → 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/recipe_crawler +4 -2
- data/lib/recipe_crawler/crawler.rb +34 -33
- data/lib/recipe_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3ba3aefb304f9ab25b0939192318bde3184d9ee9
+  data.tar.gz: 7f0f73cb2c3b29498fdc5eb47d62c862970ee813
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a4849ca0b5ee1f5d2521d0d77f1308c9c11f56f2247a45d45defc4b12fa6ed9d2cb4a36f8831162221792b9a8d2b48eb57a8bfb9d05e4bcd18953d998f17f50f
+  data.tar.gz: 3124cbe521de9c8e45ea5f11f585600f38c9399d4301de6809e08eb9f9ce4768a0cbde3bfb4cd75e1205559dc3bc7cc4a33be725d871f33328ed394f91ea98d3
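To check a downloaded copy of the gem against the new SHA512 values, a minimal sketch using Ruby's standard Digest library (the file names assume metadata.gz and data.tar.gz have been unpacked from the .gem archive into the current directory):

require "digest"

# Hypothetical local copies of the files listed in checksums.yaml
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
end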
data/bin/recipe_crawler
CHANGED

@@ -5,13 +5,15 @@ require "optparse"
 
 options = {
   limit: 1,
-  url: nil
+  url: nil,
+  interval_time: 0
 }
 
 # Parse options
 optparser = OptionParser.new do |opts|
   opts.on("-n", "--number x", Integer, "Number of recipes to fetch") { |x| options[:number] = x}
   opts.on("-u", "--url url", String, "URL of a recipe to fetch") { |x| options[:url] = x}
+  opts.on("-i", "--interval_time x", Integer, "Waiting time between requests") { |x| options[:interval_time] = x}
   opts.parse!(ARGV)
 end
 
@@ -22,7 +24,7 @@ if options[:url]
   r = RecipeCrawler::Crawler.new options[:url]
 
   # crawl and display recipe fetched
-  r.crawl!(options[:number]) do |recipe|
+  r.crawl!(options[:number], options[:interval_time]) do |recipe|
     puts "[x] #{recipe.title} fetched!"
   end
 
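With the option wired through to crawl!, the CLI takes the waiting time next to the existing flags; a hypothetical invocation (the URL and values are placeholders):

recipe_crawler -u http://www.cuisineaz.com/recettes/some-recipe -n 5 -i 2

This would fetch up to five recipes, sleeping two seconds between requests.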
data/lib/recipe_crawler/crawler.rb
CHANGED

@@ -11,14 +11,13 @@ module RecipeCrawler
   # 2. it will crawl urls found to find other urls again & again
   # 3. it will scrape urls found to get data
   #
-  # @
-  # @
-  # @
-  # @
-  # @
-  # @
-  #
-  # @attr db [SQLite3::Database] Sqlite database where recipe will be saved
+  # @attr_reader url [String] first url parsed
+  # @attr_reader host [Symbol] url's host
+  # @attr_reader scraped_urls [Array<String>] urls already scraped
+  # @attr_reader crawled_urls [Array<String>] urls already crawled
+  # @attr_reader to_crawl_urls [Array<String>] urls waiting to be crawled
+  # @attr_reader recipes [Array<RecipeSraper::Recipe>] recipes fetched
+  # @attr_reader db [SQLite3::Database] Sqlite database where recipes will be saved
   class Crawler
 
     # URLs that the crawler can parse
@@ -29,7 +28,7 @@ module RecipeCrawler
     }
 
     attr_reader :url, :host, :scraped_urls, :crawled_urls, :to_crawl_urls, :recipes
-
+    attr_accessor :interval_sleep_time
 
     #
     # Create a Crawler
@@ -42,6 +41,7 @@ module RecipeCrawler
       @scraped_urls = []
       @to_crawl_urls = []
       @to_crawl_urls << url
+      @interval_sleep_time = 0
       @db = SQLite3::Database.new "results.sqlite3"
       @db.execute "CREATE TABLE IF NOT EXISTS recipes(
         Id INTEGER PRIMARY KEY,
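A minimal construction sketch, assuming the gem is installed (the URL is a placeholder for a real cuisineaz recipe page):

require "recipe_crawler"

crawler = RecipeCrawler::Crawler.new "http://www.cuisineaz.com/recettes/..."

# The delay defaults to 0 and is exposed via attr_accessor; note that
# crawl! (below) also accepts the delay as an explicit argument.
crawler.interval_sleep_time = 2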
@@ -75,31 +75,27 @@ module RecipeCrawler
 
     #
     # Start the crawl
-    # @param limit [Integer]
+    # @param limit [Integer] the maximum number of scraped recipes
+    # @param interval_sleep_time [Integer] waiting time between scraping
     #
     # @yield [RecipeSraper::Recipe] as recipe scraped
-    def crawl! limit=2
-
+    def crawl! limit=2, interval_sleep_time=0
+      recipes_returned = 0
+
       if @host == :cuisineaz
-
-
-
+
+        while !@to_crawl_urls.empty? and limit > @recipes.count
+          # find all links on the given url (and the urls of these)
+          get_links @to_crawl_urls[0]
+          # now scrape a url
+          recipe = scrape @to_crawl_urls[0]
+          yield recipe if recipe and block_given?
+          sleep interval_sleep_time
         end
 
       else
         raise NotImplementedError
       end
-
-      # scrap urls
-      recipes_returned = 0
-      @crawled_urls.each{ |crawled_url|
-        if limit > recipes_returned
-          yield scrape crawled_url
-          recipes_returned += 1
-        else
-          break
-        end
-      } if block_given?
     end
 
 
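After this change, crawling and scraping happen in one pass and the loop stops once the limit is reached; a hedged usage sketch continuing the example above (still a placeholder URL, and assuming a cuisineaz host so NotImplementedError is not raised):

# Fetch up to 3 recipes, sleeping 1 second between requests.
crawler.crawl!(3, 1) do |recipe|
  puts recipe.title
end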
@@ -108,14 +104,19 @@ module RecipeCrawler
     # @param url [String] as url to scrape
     #
     # @return [RecipeSraper::Recipe] as recipe scraped
+    # @return [nil] if recipe cannot be fetched
     def scrape url
-
-
-
-
-
-
-
+      begin
+        recipe = RecipeSraper::Recipe.new url
+        @scraped_urls << url
+        @recipes << recipe
+        if save recipe
+          return recipe
+        else
+          raise SQLite3::Exception, 'cannot save recipe'
+        end
+      rescue OpenURI::HTTPError
+        return nil
       end
     end
 
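The rescue gives scrape a soft-failure contract: an OpenURI::HTTPError produces nil instead of aborting the crawl, which is exactly what the `yield recipe if recipe` guard in crawl! relies on. The same pattern in isolation, as a minimal sketch (fetch_or_nil is illustrative, not part of the gem):

require "open-uri"

# Hypothetical helper mirroring scrape's error handling.
def fetch_or_nil(url)
  URI.open(url).read
rescue OpenURI::HTTPError
  nil # caller can skip this url instead of crashing
end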