hangry 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -0
- data/lib/hangry/hrecipe_parser.rb +61 -0
- data/lib/hangry/recipe_parser.rb +61 -0
- data/lib/hangry/schema_org_recipe_parser.rb +70 -0
- data/lib/hangry/version.rb +1 -1
- data/lib/hangry.rb +9 -107
- data/spec/fixtures/epicurious.html +3610 -0
- data/spec/real_examples/epicurious_spec.rb +50 -0
- data/spec/real_examples/food_network_spec.rb +1 -1
- metadata +18 -11
data/README.md
CHANGED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Hangry
  # Recipe parser for documents marked up with hRecipe-style CSS classes.
  # The recipe root is the first element matching `.hrecipe`; each attribute
  # is read from descendants selected by class name.
  class HRecipeParser < RecipeParser

    # CSS selector RecipeParser#initialize uses to locate the recipe root.
    def self.root_selector
      '.hrecipe'
    end

    private

    # Returns the first node under the recipe root with CSS class +klass+,
    # or a NullObject when nothing matches.
    def node_with_class(klass)
      nodes_with_class(klass).first || NullObject.new
    end

    # Returns every node under the recipe root with CSS class +klass+.
    # Falls back to a NullObject when no recipe root was found, mirroring
    # SchemaOrgRecipeParser#nodes_with_itemprop, so callers never hit nil.
    def nodes_with_class(klass)
      recipe_ast ? recipe_ast.css(".#{klass}") : NullObject.new
    end

    # Author text taken from the `.author` node.
    def parse_author
      clean_string node_with_class(:author).content
    end

    # TODO: not implemented yet; always returns nil.
    def parse_cook_time
      nil
    end

    # Description text taken from the `.summary` node.
    def parse_description
      clean_string node_with_class(:summary).content
    end

    # One whitespace-normalised string per `.ingredient` node.
    def parse_ingredients
      nodes_with_class(:ingredient).map(&:content).map { |ingredient| clean_string ingredient }
    end

    # Instruction text taken from the `.instructions` node.
    def parse_instructions
      clean_string node_with_class(:instructions).content
    end

    # Recipe name taken from the `.fn` node.
    def parse_name
      clean_string node_with_class(:fn).content
    end

    # TODO: not implemented yet; always returns nil.
    def parse_prep_time
      nil
    end

    # TODO: not implemented yet; always returns nil.
    def parse_published_date
      nil
    end

    # Total time in minutes, read from the `title` attribute of the first
    # `.value-title` element inside the `.duration` node.
    #
    # Returns nil when the duration node, the `.value-title` element or its
    # title is missing/empty, or when the title is not a duration the ISO8601
    # parser recognises (same policy as SchemaOrgRecipeParser#parse_time).
    def parse_total_time
      value_title = node_with_class(:duration).css('.value-title').first
      return nil if value_title.nil?

      iso8601_string = value_title['title']
      # to_s covers nil (attribute absent) and NullObject (no duration node).
      return nil if iso8601_string.to_s.strip.empty?

      parse_duration(iso8601_string)
    rescue ISO8601::Errors::UnknownPattern
      nil
    end

    # Yield text taken from the `.yield` node.
    def parse_yield
      clean_string node_with_class(:yield).content
    end

  end

end
|
61
|
+
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Hangry
  # Base class for recipe parsers.
  #
  # A subclass must provide:
  # * a class method +root_selector+ returning the CSS selector of the
  #   element that wraps the recipe, and
  # * one private +parse_<attribute>+ method per entry in RECIPE_ATTRIBUTES.
  class RecipeParser
    attr_reader :recipe_html
    attr_accessor :recipe_ast, :recipe

    # Parses +recipe_html+ with Nokogiri and stores the first node matching
    # the subclass's root selector in +recipe_ast+ (nil when none matches).
    def initialize(recipe_html)
      @recipe_html = recipe_html
      @recipe = Recipe.new
      doc = Nokogiri::HTML(recipe_html)
      self.recipe_ast = doc.css(self.class.root_selector).first
    end

    # Fills the Recipe struct by calling +parse_<attribute>+ for every
    # attribute in RECIPE_ATTRIBUTES and returns it. NullObject results are
    # stored as nil.
    def parse
      RECIPE_ATTRIBUTES.each do |attribute|
        attr_value = value(send("parse_#{attribute}"))
        recipe.public_send("#{attribute}=", attr_value)
      end
      recipe
    end

    # Truthy (the recipe root node) when the document contains an element
    # matching the subclass's root selector, nil otherwise.
    def can_parse?
      recipe_ast
    end

    private

    # Stand-in for a missing node: any unknown message returns the same
    # object, so lookup chains such as +node.content.strip+ never raise.
    # The explicit conversions below return empty/zero values.
    class NullObject
      def method_missing(*args, &block)
        self
      end
      def blank?; true; end
      def present?; false; end
      def to_a; []; end
      def to_ary; []; end
      def to_s; ""; end
      def to_str; ""; end
      def to_f; 0.0; end
      def to_i; 0; end
    end

    # Maps a NullObject to nil; every other object is returned unchanged.
    def value(object)
      case object
      when NullObject then nil
      else object
      end
    end

    # Strips leading/trailing whitespace and collapses every internal run of
    # whitespace (including newlines) into a single space.
    #
    # nil is returned as-is: attribute lookups such as +node['content']+
    # yield nil when the attribute is absent, and that must not raise here.
    # A NullObject passes through untouched because it swallows the calls.
    def clean_string(string)
      return nil if string.nil?

      string.strip.gsub(/\s+/, ' ')
    end

    # Converts an ISO 8601 duration string into whole minutes using only its
    # hours and minutes components; any other component is not counted.
    # Errors raised by ISO8601::Duration.new propagate to the caller.
    def parse_duration(iso8601_string)
      duration = ISO8601::Duration.new(iso8601_string)
      duration.hours.to_i * 60 + duration.minutes.to_i
    end

  end

end
|
60
|
+
|
61
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Hangry
  # Recipe parser for schema.org/Recipe microdata. The recipe root is the
  # first element whose +itemtype+ is "http://schema.org/Recipe"; attributes
  # are read from descendants selected by their +itemprop+ value.
  class SchemaOrgRecipeParser < RecipeParser

    # CSS selector RecipeParser#initialize uses to locate the recipe root.
    def self.root_selector
      '[itemtype="http://schema.org/Recipe"]'
    end

    private

    # First node under the recipe root with the given +itemprop+, or a
    # NullObject when nothing matches.
    def node_with_itemprop(itemprop)
      nodes_with_itemprop(itemprop).first || NullObject.new
    end

    # All nodes under the recipe root with the given +itemprop+; a
    # NullObject when no recipe root was found.
    def nodes_with_itemprop(itemprop)
      recipe_ast ? recipe_ast.css("[itemprop = \"#{itemprop}\"]") : NullObject.new
    end

    # Author name.
    #
    # When the author node is itself a schema.org/Person item, the name comes
    # from its nested itemprop="name" node: the +content+ attribute if it is
    # set, otherwise that node's text. If the Person has no name node at all,
    # the author node is used instead, so a sparse Person item no longer
    # raises. For any other author node its text content is used.
    def parse_author
      author_node = node_with_itemprop(:author)
      author = if author_node['itemtype'] == "http://schema.org/Person"
        name_node = author_node.css('[itemprop = "name"]').first || author_node
        name_from_attribute = name_node['content']
        name_from_attribute.blank? ? name_node.content : name_from_attribute
      else
        author_node.content
      end
      clean_string author
    end

    # Cook time in minutes (itemprop "cookTime"); see #parse_time.
    def parse_cook_time
      parse_time(:cookTime)
    end

    # Description text (itemprop "description").
    def parse_description
      clean_string node_with_itemprop(:description).content
    end

    # One string per itemprop "ingredients" node.
    def parse_ingredients
      nodes_with_itemprop(:ingredients).map(&:content).map do |ingredient|
        # remove newlines and excess whitespace from ingredients
        clean_string ingredient
      end
    end

    # Instruction text (itemprop "recipeInstructions").
    def parse_instructions
      clean_string node_with_itemprop(:recipeInstructions).content
    end

    # Recipe name (itemprop "name").
    def parse_name
      clean_string node_with_itemprop(:name).content
    end

    # Prep time in minutes (itemprop "prepTime"); see #parse_time.
    def parse_prep_time
      parse_time(:prepTime)
    end

    # Publication date parsed from the +content+ attribute of the
    # itemprop "datePublished" node; nil when that attribute is blank.
    def parse_published_date
      content = node_with_itemprop(:datePublished)['content']
      content.blank? ? nil : Date.parse(content)
    end

    # Duration in minutes for the node with itemprop +type+. The ISO 8601
    # string is read from the +content+ attribute when present, otherwise
    # from +datetime+. Returns nil when the string is not a recognised
    # ISO 8601 duration.
    def parse_time(type)
      node = node_with_itemprop(type)
      iso8601_string = if node['content'].present?
        node['content'] # foodnetwork.com
      else
        node['datetime'] # allrecipes.com
      end
      parse_duration(iso8601_string)
    rescue ISO8601::Errors::UnknownPattern
      nil
    end

    # Total time in minutes (itemprop "totalTime"); see #parse_time.
    def parse_total_time
      parse_time(:totalTime)
    end

    # Yield text (itemprop "recipeYield").
    def parse_yield
      clean_string node_with_itemprop(:recipeYield).content
    end

  end

end
|
data/lib/hangry/version.rb
CHANGED
data/lib/hangry.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
require "hangry/version"
|
2
|
+
require 'hangry/recipe_parser'
|
3
|
+
require 'hangry/hrecipe_parser'
|
4
|
+
require 'hangry/schema_org_recipe_parser'
|
2
5
|
require 'active_support/core_ext/object/blank'
|
3
6
|
require 'date'
|
4
7
|
require 'iso8601'
|
5
8
|
require "nokogiri"
|
6
9
|
|
7
10
|
module Hangry
|
8
|
-
def self.parse(html)
|
9
|
-
parse_schema_org_recipe(html)
|
10
|
-
end
|
11
|
-
|
12
11
|
RECIPE_ATTRIBUTES = [
|
13
12
|
:author,
|
14
13
|
:cook_time,
|
@@ -24,110 +23,13 @@ module Hangry
|
|
24
23
|
|
25
24
|
Recipe = Struct.new(*RECIPE_ATTRIBUTES)
|
26
25
|
|
27
|
-
def self.
|
28
|
-
SchemaOrgRecipeParser
|
29
|
-
|
30
|
-
|
31
|
-
class SchemaOrgRecipeParser
|
32
|
-
attr_reader :recipe_html
|
33
|
-
attr_accessor :recipe_ast, :recipe
|
34
|
-
|
35
|
-
def initialize(recipe_html)
|
36
|
-
@recipe_html = recipe_html
|
37
|
-
@recipe = Recipe.new
|
38
|
-
doc = Nokogiri::HTML(recipe_html)
|
39
|
-
self.recipe_ast = doc.css('[itemtype="http://schema.org/Recipe"]').first
|
40
|
-
end
|
41
|
-
|
42
|
-
def parse
|
43
|
-
RECIPE_ATTRIBUTES.each do |attribute|
|
44
|
-
attr_value = value(send("parse_#{attribute}"))
|
45
|
-
recipe.public_send("#{attribute}=", attr_value)
|
46
|
-
end
|
47
|
-
recipe
|
48
|
-
end
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
class NullObject
|
53
|
-
def method_missing(*args, &block)
|
54
|
-
self
|
55
|
-
end
|
56
|
-
def blank?; true; end
|
57
|
-
def present?; false; end
|
58
|
-
def to_a; []; end
|
59
|
-
def to_ary; []; end
|
60
|
-
def to_s; ""; end
|
61
|
-
def to_str; ""; end
|
62
|
-
def to_f; 0.0; end
|
63
|
-
def to_i; 0; end
|
64
|
-
end
|
65
|
-
|
66
|
-
def value(object)
|
67
|
-
case object
|
68
|
-
when NullObject then nil
|
69
|
-
else object
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
def node_with_itemprop(itemprop)
|
74
|
-
nodes_with_itemprop(itemprop).first || NullObject.new
|
75
|
-
end
|
76
|
-
def nodes_with_itemprop(itemprop)
|
77
|
-
recipe_ast ? recipe_ast.css("[itemprop = \"#{itemprop}\"]") : NullObject.new
|
78
|
-
end
|
79
|
-
def parse_author
|
80
|
-
author_node = node_with_itemprop(:author)
|
81
|
-
if author_node['itemtype'] == "http://schema.org/Person"
|
82
|
-
author_node.css('[itemprop = "name"]').first['content']
|
83
|
-
else
|
84
|
-
author_node.content
|
85
|
-
end
|
86
|
-
end
|
87
|
-
def parse_cook_time
|
88
|
-
parse_time(:cookTime)
|
89
|
-
end
|
90
|
-
def parse_description
|
91
|
-
node_with_itemprop(:description).content
|
92
|
-
end
|
93
|
-
def parse_ingredients
|
94
|
-
nodes_with_itemprop(:ingredients).map(&:content).map do |ingredient|
|
95
|
-
# remove newlines and excess whitespace from ingredients
|
96
|
-
ingredient.strip.gsub(/\s+/, ' ')
|
97
|
-
end
|
98
|
-
end
|
99
|
-
def parse_instructions
|
100
|
-
node_with_itemprop(:recipeInstructions).content.strip
|
101
|
-
end
|
102
|
-
def parse_name
|
103
|
-
node_with_itemprop(:name).content
|
104
|
-
end
|
105
|
-
def parse_prep_time
|
106
|
-
parse_time(:prepTime)
|
107
|
-
end
|
108
|
-
def parse_published_date
|
109
|
-
content = node_with_itemprop(:datePublished)['content']
|
110
|
-
content.blank? ? nil : Date.parse(content)
|
111
|
-
end
|
112
|
-
def parse_time(type)
|
113
|
-
node = node_with_itemprop(type)
|
114
|
-
iso8601_string = if node['content'].present?
|
115
|
-
node['content'] # foodnetwork.com
|
116
|
-
else
|
117
|
-
node['datetime'] # allrecipes.com
|
118
|
-
end
|
119
|
-
duration = ISO8601::Duration.new(iso8601_string)
|
120
|
-
duration.hours.to_i * 60 + duration.minutes.to_i
|
121
|
-
rescue ISO8601::Errors::UnknownPattern
|
122
|
-
nil
|
123
|
-
end
|
124
|
-
def parse_total_time
|
125
|
-
parse_time(:totalTime)
|
126
|
-
end
|
127
|
-
def parse_yield
|
128
|
-
node_with_itemprop(:recipeYield).content
|
129
|
-
end
|
26
|
+
# Parses +html+ with the first parser able to handle it and returns the
# resulting Recipe. Parsers are tried in order: schema.org microdata first,
# then hRecipe. When neither finds a recipe root, an empty Recipe is returned.
#
# Parsers are built one at a time and the search stops at the first match,
# because each parser's constructor parses the whole HTML document.
def self.parse(html)
  [SchemaOrgRecipeParser, HRecipeParser].each do |parser_class|
    parser = parser_class.new(html)
    return parser.parse if parser.can_parse?
  end

  Recipe.new
end
|
33
|
+
|
132
34
|
end
|
133
35
|
|