hangry 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -0
- data/lib/hangry/hrecipe_parser.rb +61 -0
- data/lib/hangry/recipe_parser.rb +61 -0
- data/lib/hangry/schema_org_recipe_parser.rb +70 -0
- data/lib/hangry/version.rb +1 -1
- data/lib/hangry.rb +9 -107
- data/spec/fixtures/epicurious.html +3610 -0
- data/spec/real_examples/epicurious_spec.rb +50 -0
- data/spec/real_examples/food_network_spec.rb +1 -1
- metadata +18 -11
data/README.md
CHANGED
data/lib/hangry/hrecipe_parser.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Hangry
|
2
|
+
class HRecipeParser < RecipeParser
|
3
|
+
|
4
|
+
def self.root_selector
|
5
|
+
'.hrecipe'
|
6
|
+
end
|
7
|
+
|
8
|
+
private
|
9
|
+
|
10
|
+
def node_with_class(klass)
|
11
|
+
nodes_with_class(klass).first || NullObject.new
|
12
|
+
end
|
13
|
+
|
14
|
+
def nodes_with_class(klass)
|
15
|
+
recipe_ast.css(".#{klass}")
|
16
|
+
end
|
17
|
+
|
18
|
+
def parse_author
|
19
|
+
clean_string node_with_class(:author).content
|
20
|
+
end
|
21
|
+
|
22
|
+
def parse_cook_time
|
23
|
+
#TODO
|
24
|
+
end
|
25
|
+
|
26
|
+
def parse_description
|
27
|
+
clean_string node_with_class(:summary).content
|
28
|
+
end
|
29
|
+
|
30
|
+
def parse_ingredients
|
31
|
+
nodes_with_class(:ingredient).map(&:content).map { |ingredient| clean_string ingredient }
|
32
|
+
end
|
33
|
+
|
34
|
+
def parse_instructions
|
35
|
+
clean_string node_with_class(:instructions).content
|
36
|
+
end
|
37
|
+
|
38
|
+
def parse_name
|
39
|
+
clean_string node_with_class(:fn).content
|
40
|
+
end
|
41
|
+
|
42
|
+
def parse_prep_time
|
43
|
+
#TODO
|
44
|
+
end
|
45
|
+
|
46
|
+
def parse_published_date
|
47
|
+
#TODO
|
48
|
+
end
|
49
|
+
|
50
|
+
def parse_total_time
|
51
|
+
parse_duration node_with_class(:duration).css('.value-title').first['title']
|
52
|
+
end
|
53
|
+
|
54
|
+
def parse_yield
|
55
|
+
clean_string node_with_class(:yield).content
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
end
|
61
|
+
|
data/lib/hangry/recipe_parser.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Hangry
|
2
|
+
class RecipeParser
|
3
|
+
attr_reader :recipe_html
|
4
|
+
attr_accessor :recipe_ast, :recipe
|
5
|
+
|
6
|
+
def initialize(recipe_html)
|
7
|
+
@recipe_html = recipe_html
|
8
|
+
@recipe = Recipe.new
|
9
|
+
doc = Nokogiri::HTML(recipe_html)
|
10
|
+
self.recipe_ast = doc.css(self.class.root_selector).first
|
11
|
+
end
|
12
|
+
|
13
|
+
def parse
|
14
|
+
RECIPE_ATTRIBUTES.each do |attribute|
|
15
|
+
attr_value = value(send("parse_#{attribute}"))
|
16
|
+
recipe.public_send("#{attribute}=", attr_value)
|
17
|
+
end
|
18
|
+
recipe
|
19
|
+
end
|
20
|
+
|
21
|
+
def can_parse?
|
22
|
+
recipe_ast
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
class NullObject
|
28
|
+
def method_missing(*args, &block)
|
29
|
+
self
|
30
|
+
end
|
31
|
+
def blank?; true; end
|
32
|
+
def present?; false; end
|
33
|
+
def to_a; []; end
|
34
|
+
def to_ary; []; end
|
35
|
+
def to_s; ""; end
|
36
|
+
def to_str; ""; end
|
37
|
+
def to_f; 0.0; end
|
38
|
+
def to_i; 0; end
|
39
|
+
end
|
40
|
+
|
41
|
+
def value(object)
|
42
|
+
case object
|
43
|
+
when NullObject then nil
|
44
|
+
else object
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def clean_string(string)
|
49
|
+
string.strip.gsub(/\s+/, ' ')
|
50
|
+
end
|
51
|
+
|
52
|
+
def parse_duration(iso8601_string)
|
53
|
+
duration = ISO8601::Duration.new(iso8601_string)
|
54
|
+
duration.hours.to_i * 60 + duration.minutes.to_i
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
|
data/lib/hangry/schema_org_recipe_parser.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
module Hangry
|
2
|
+
class SchemaOrgRecipeParser < RecipeParser
|
3
|
+
|
4
|
+
def self.root_selector
|
5
|
+
'[itemtype="http://schema.org/Recipe"]'
|
6
|
+
end
|
7
|
+
|
8
|
+
private
|
9
|
+
|
10
|
+
def node_with_itemprop(itemprop)
|
11
|
+
nodes_with_itemprop(itemprop).first || NullObject.new
|
12
|
+
end
|
13
|
+
def nodes_with_itemprop(itemprop)
|
14
|
+
recipe_ast ? recipe_ast.css("[itemprop = \"#{itemprop}\"]") : NullObject.new
|
15
|
+
end
|
16
|
+
def parse_author
|
17
|
+
author_node = node_with_itemprop(:author)
|
18
|
+
author = if author_node['itemtype'] == "http://schema.org/Person"
|
19
|
+
author_node.css('[itemprop = "name"]').first['content']
|
20
|
+
else
|
21
|
+
author_node.content
|
22
|
+
end
|
23
|
+
clean_string author
|
24
|
+
end
|
25
|
+
def parse_cook_time
|
26
|
+
parse_time(:cookTime)
|
27
|
+
end
|
28
|
+
def parse_description
|
29
|
+
clean_string node_with_itemprop(:description).content
|
30
|
+
end
|
31
|
+
def parse_ingredients
|
32
|
+
nodes_with_itemprop(:ingredients).map(&:content).map do |ingredient|
|
33
|
+
# remove newlines and excess whitespace from ingredients
|
34
|
+
clean_string ingredient
|
35
|
+
end
|
36
|
+
end
|
37
|
+
def parse_instructions
|
38
|
+
clean_string node_with_itemprop(:recipeInstructions).content
|
39
|
+
end
|
40
|
+
def parse_name
|
41
|
+
clean_string node_with_itemprop(:name).content
|
42
|
+
end
|
43
|
+
def parse_prep_time
|
44
|
+
parse_time(:prepTime)
|
45
|
+
end
|
46
|
+
def parse_published_date
|
47
|
+
content = node_with_itemprop(:datePublished)['content']
|
48
|
+
content.blank? ? nil : Date.parse(content)
|
49
|
+
end
|
50
|
+
def parse_time(type)
|
51
|
+
node = node_with_itemprop(type)
|
52
|
+
iso8601_string = if node['content'].present?
|
53
|
+
node['content'] # foodnetwork.com
|
54
|
+
else
|
55
|
+
node['datetime'] # allrecipes.com
|
56
|
+
end
|
57
|
+
parse_duration(iso8601_string)
|
58
|
+
rescue ISO8601::Errors::UnknownPattern
|
59
|
+
nil
|
60
|
+
end
|
61
|
+
def parse_total_time
|
62
|
+
parse_time(:totalTime)
|
63
|
+
end
|
64
|
+
def parse_yield
|
65
|
+
clean_string node_with_itemprop(:recipeYield).content
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
data/lib/hangry/version.rb
CHANGED
data/lib/hangry.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
require "hangry/version"
|
2
|
+
require 'hangry/recipe_parser'
|
3
|
+
require 'hangry/hrecipe_parser'
|
4
|
+
require 'hangry/schema_org_recipe_parser'
|
2
5
|
require 'active_support/core_ext/object/blank'
|
3
6
|
require 'date'
|
4
7
|
require 'iso8601'
|
5
8
|
require "nokogiri"
|
6
9
|
|
7
10
|
module Hangry
|
8
|
-
def self.parse(html)
|
9
|
-
parse_schema_org_recipe(html)
|
10
|
-
end
|
11
|
-
|
12
11
|
RECIPE_ATTRIBUTES = [
|
13
12
|
:author,
|
14
13
|
:cook_time,
|
@@ -24,110 +23,13 @@ module Hangry
|
|
24
23
|
|
25
24
|
Recipe = Struct.new(*RECIPE_ATTRIBUTES)
|
26
25
|
|
27
|
-
def self.parse_schema_org_recipe(html)
|
28
|
-
SchemaOrgRecipeParser.new(html).parse
|
29
|
-
end
|
30
|
-
|
31
|
-
class SchemaOrgRecipeParser
|
32
|
-
attr_reader :recipe_html
|
33
|
-
attr_accessor :recipe_ast, :recipe
|
34
|
-
|
35
|
-
def initialize(recipe_html)
|
36
|
-
@recipe_html = recipe_html
|
37
|
-
@recipe = Recipe.new
|
38
|
-
doc = Nokogiri::HTML(recipe_html)
|
39
|
-
self.recipe_ast = doc.css('[itemtype="http://schema.org/Recipe"]').first
|
40
|
-
end
|
41
|
-
|
42
|
-
def parse
|
43
|
-
RECIPE_ATTRIBUTES.each do |attribute|
|
44
|
-
attr_value = value(send("parse_#{attribute}"))
|
45
|
-
recipe.public_send("#{attribute}=", attr_value)
|
46
|
-
end
|
47
|
-
recipe
|
48
|
-
end
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
class NullObject
|
53
|
-
def method_missing(*args, &block)
|
54
|
-
self
|
55
|
-
end
|
56
|
-
def blank?; true; end
|
57
|
-
def present?; false; end
|
58
|
-
def to_a; []; end
|
59
|
-
def to_ary; []; end
|
60
|
-
def to_s; ""; end
|
61
|
-
def to_str; ""; end
|
62
|
-
def to_f; 0.0; end
|
63
|
-
def to_i; 0; end
|
64
|
-
end
|
65
|
-
|
66
|
-
def value(object)
|
67
|
-
case object
|
68
|
-
when NullObject then nil
|
69
|
-
else object
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
def node_with_itemprop(itemprop)
|
74
|
-
nodes_with_itemprop(itemprop).first || NullObject.new
|
75
|
-
end
|
76
|
-
def nodes_with_itemprop(itemprop)
|
77
|
-
recipe_ast ? recipe_ast.css("[itemprop = \"#{itemprop}\"]") : NullObject.new
|
78
|
-
end
|
79
|
-
def parse_author
|
80
|
-
author_node = node_with_itemprop(:author)
|
81
|
-
if author_node['itemtype'] == "http://schema.org/Person"
|
82
|
-
author_node.css('[itemprop = "name"]').first['content']
|
83
|
-
else
|
84
|
-
author_node.content
|
85
|
-
end
|
86
|
-
end
|
87
|
-
def parse_cook_time
|
88
|
-
parse_time(:cookTime)
|
89
|
-
end
|
90
|
-
def parse_description
|
91
|
-
node_with_itemprop(:description).content
|
92
|
-
end
|
93
|
-
def parse_ingredients
|
94
|
-
nodes_with_itemprop(:ingredients).map(&:content).map do |ingredient|
|
95
|
-
# remove newlines and excess whitespace from ingredients
|
96
|
-
ingredient.strip.gsub(/\s+/, ' ')
|
97
|
-
end
|
98
|
-
end
|
99
|
-
def parse_instructions
|
100
|
-
node_with_itemprop(:recipeInstructions).content.strip
|
101
|
-
end
|
102
|
-
def parse_name
|
103
|
-
node_with_itemprop(:name).content
|
104
|
-
end
|
105
|
-
def parse_prep_time
|
106
|
-
parse_time(:prepTime)
|
107
|
-
end
|
108
|
-
def parse_published_date
|
109
|
-
content = node_with_itemprop(:datePublished)['content']
|
110
|
-
content.blank? ? nil : Date.parse(content)
|
111
|
-
end
|
112
|
-
def parse_time(type)
|
113
|
-
node = node_with_itemprop(type)
|
114
|
-
iso8601_string = if node['content'].present?
|
115
|
-
node['content'] # foodnetwork.com
|
116
|
-
else
|
117
|
-
node['datetime'] # allrecipes.com
|
118
|
-
end
|
119
|
-
duration = ISO8601::Duration.new(iso8601_string)
|
120
|
-
duration.hours.to_i * 60 + duration.minutes.to_i
|
121
|
-
rescue ISO8601::Errors::UnknownPattern
|
122
|
-
nil
|
123
|
-
end
|
124
|
-
def parse_total_time
|
125
|
-
parse_time(:totalTime)
|
126
|
-
end
|
127
|
-
def parse_yield
|
128
|
-
node_with_itemprop(:recipeYield).content
|
129
|
-
end
|
26
|
+
def self.parse(html)
|
27
|
+
parser_classes = [SchemaOrgRecipeParser, HRecipeParser]
|
28
|
+
parsers = parser_classes.map { |klass| klass.new(html) }
|
29
|
+
parser = parsers.detect { |p| p.can_parse? }
|
130
30
|
|
31
|
+
parser ? parser.parse : Recipe.new
|
131
32
|
end
|
33
|
+
|
132
34
|
end
|
133
35
|
|