tychus 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/tychus/parsers/allrecipes_parser.rb +0 -1
- data/lib/tychus/parsers/base.rb +1 -89
- data/lib/tychus/parsers/food_network_parser.rb +1 -7
- data/lib/tychus/parsers/schema_org_parser.rb +83 -12
- data/lib/tychus/version.rb +1 -1
- data/spec/parsers/schema_org_parser_spec.rb +2 -2
- metadata +1 -3
- data/lib/tychus/recipe.rb +0 -0
- data/lib/tychus/uri_parser.rb +0 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31bfca329642b6f6ff3ea7e03c5e51c6ddc19dc1
|
4
|
+
data.tar.gz: cc5950e7db596ed6f2726d8eeeb139a15ef3a543
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5373215f9f13c69a02bf53af7d67a41d2abe42485cec37f3fb530c0ca7f24b442c070fad454b595473b45fb8262b9dd37ee7021cc97c09202ae839f05267f811
|
7
|
+
data.tar.gz: 4a75a2251cea8003277b304eaf596746578f4ec2ec7d1a0644e0cc904424514066c98a5c0bc2eeeacca4e1a836ef962cd3cf7d8d28132dafb3137b9359ffca88
|
data/lib/tychus/parsers/base.rb
CHANGED
@@ -32,7 +32,7 @@ module Parsers
|
|
32
32
|
@uri = uri
|
33
33
|
@recipe = Recipe.new
|
34
34
|
@doc = Nokogiri::HTML(open(uri))
|
35
|
-
@recipe_doc = @doc.css(
|
35
|
+
@recipe_doc = @doc.css(root_doc)
|
36
36
|
end
|
37
37
|
|
38
38
|
def parse
|
@@ -43,94 +43,6 @@ module Parsers
|
|
43
43
|
recipe
|
44
44
|
end
|
45
45
|
|
46
|
-
def parse_author
|
47
|
-
# is it always first?
|
48
|
-
itemprop_node_for(:author).content
|
49
|
-
end
|
50
|
-
|
51
|
-
def parse_description
|
52
|
-
# is it always first?
|
53
|
-
itemprop_node_for(:description).content
|
54
|
-
end
|
55
|
-
|
56
|
-
def parse_recipe_instructions
|
57
|
-
# strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
|
58
|
-
#
|
59
|
-
# Allrecipes: <li><span>lorem ipsum</span></li>
|
60
|
-
# FoodNetwork: <p>lorem ipsum</p>
|
61
|
-
# reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
|
62
|
-
reject_regex = /^(h.|div)$/
|
63
|
-
|
64
|
-
clean_instructions(itemprop_node_for(:recipeInstructions)
|
65
|
-
.element_children
|
66
|
-
.reject { |node| node.name =~ reject_regex }
|
67
|
-
.map do |node|
|
68
|
-
node.content
|
69
|
-
.squeeze(" ")
|
70
|
-
.rstrip
|
71
|
-
.split("\r\n\s\r\n\s")
|
72
|
-
end.flatten.reject(&:blank?))
|
73
|
-
end
|
74
|
-
|
75
|
-
def parse_name
|
76
|
-
# is it always first?
|
77
|
-
itemprop_node_for(:name).content
|
78
|
-
end
|
79
|
-
|
80
|
-
def parse_cook_time
|
81
|
-
# is it always first?
|
82
|
-
# leverage iso8601
|
83
|
-
parse_duration(itemprop_node_for(:cookTime))
|
84
|
-
end
|
85
|
-
|
86
|
-
def parse_image
|
87
|
-
# is it always first?
|
88
|
-
itemprop_node_for(:image).attr('src')
|
89
|
-
end
|
90
|
-
|
91
|
-
def parse_ingredients
|
92
|
-
# NOT FIRST
|
93
|
-
recipe_doc
|
94
|
-
.css('[itemprop="ingredients"]')
|
95
|
-
.map do |ingredient_node|
|
96
|
-
ingredient_node
|
97
|
-
.element_children
|
98
|
-
.map(&:content)
|
99
|
-
.join(" ")
|
100
|
-
end.reject(&:blank?)
|
101
|
-
end
|
102
|
-
|
103
|
-
def parse_prep_time
|
104
|
-
# is it always first?
|
105
|
-
# leverage iso8601
|
106
|
-
parse_duration(itemprop_node_for(:prepTime))
|
107
|
-
end
|
108
|
-
|
109
|
-
def parse_duration(node)
|
110
|
-
# Allrecipes - 'time' element
|
111
|
-
# Foodnetwork - 'meta' element (std according to
|
112
|
-
# Schema.org/Recipe)
|
113
|
-
case node.name
|
114
|
-
when "meta", "span"
|
115
|
-
node.attr('content')
|
116
|
-
when "time"
|
117
|
-
node.attr('datetime')
|
118
|
-
else
|
119
|
-
NullObject.new
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
def parse_recipe_yield
|
124
|
-
# is it always first?
|
125
|
-
itemprop_node_for(:recipeYield).content
|
126
|
-
end
|
127
|
-
|
128
|
-
def parse_total_time
|
129
|
-
# is it always first?
|
130
|
-
# leverage iso8601
|
131
|
-
parse_duration(itemprop_node_for(:totalTime))
|
132
|
-
end
|
133
|
-
|
134
46
|
def recipe_attributes
|
135
47
|
self.class.recipe_attributes
|
136
48
|
end
|
@@ -7,6 +7,7 @@ module Parsers
|
|
7
7
|
|
8
8
|
def parse_author
|
9
9
|
# in the case of an author advertising her TV show
|
10
|
+
# TODO: test case where the author is _not_
|
10
11
|
itemprop_node_for(:author)
|
11
12
|
.css('span')
|
12
13
|
.first
|
@@ -21,17 +22,10 @@ module Parsers
|
|
21
22
|
end
|
22
23
|
|
23
24
|
def parse_ingredients
|
24
|
-
# NOT FIRST
|
25
25
|
recipe_doc
|
26
26
|
.css('[itemprop="ingredients"]')
|
27
27
|
.map { |node| node.content.lstrip.squeeze(" ").chomp }
|
28
28
|
end
|
29
|
-
|
30
|
-
def clean_instructions(obj)
|
31
|
-
#TODO: what is best pattern to share this behavior?
|
32
|
-
obj
|
33
|
-
end
|
34
|
-
|
35
29
|
end
|
36
30
|
end
|
37
31
|
end
|
@@ -3,37 +3,108 @@ module Parsers
|
|
3
3
|
|
4
4
|
class SchemaOrgParser < Base
|
5
5
|
|
6
|
+
attr_reader :root_doc, :review_doc, :video_object_doc
|
7
|
+
|
6
8
|
def initialize(uri)
|
9
|
+
@root_doc = '[itemtype="http://schema.org/Recipe"]'
|
10
|
+
@review_doc = '[itemtype="http://schema.org/Review"]'
|
11
|
+
@video_object_doc = '[itemtype="http://www.schema.org/VideoObject"]'
|
7
12
|
super
|
8
13
|
strip_review_microformat
|
9
14
|
strip_video_object_microformat
|
10
15
|
end
|
11
16
|
|
12
|
-
def
|
13
|
-
|
17
|
+
def parse_author
|
18
|
+
itemprop_node_for(:author).content
|
14
19
|
end
|
15
20
|
|
16
|
-
def
|
17
|
-
|
21
|
+
def parse_description
|
22
|
+
# is it always first?
|
23
|
+
itemprop_node_for(:description).content
|
18
24
|
end
|
19
25
|
|
26
|
+
def parse_recipe_instructions
|
27
|
+
# strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
|
28
|
+
#
|
29
|
+
# Allrecipes: <li><span>lorem ipsum</span></li>
|
30
|
+
# FoodNetwork: <p>lorem ipsum</p>
|
31
|
+
# reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
|
32
|
+
reject_regex = /^(h.|div)$/
|
20
33
|
|
21
|
-
|
22
|
-
|
34
|
+
clean_instructions(itemprop_node_for(:recipeInstructions)
|
35
|
+
.element_children
|
36
|
+
.reject { |node| node.name =~ reject_regex }
|
37
|
+
.map do |node|
|
38
|
+
node.content
|
39
|
+
.squeeze(" ")
|
40
|
+
.rstrip
|
41
|
+
.split("\r\n\s\r\n\s")
|
42
|
+
end.flatten.reject(&:blank?))
|
23
43
|
end
|
24
44
|
|
25
|
-
def
|
26
|
-
|
45
|
+
def parse_name
|
46
|
+
itemprop_node_for(:name).content
|
47
|
+
end
|
48
|
+
|
49
|
+
def parse_cook_time
|
50
|
+
# leverage iso8601
|
51
|
+
parse_duration(itemprop_node_for(:cookTime))
|
52
|
+
end
|
53
|
+
|
54
|
+
def parse_image
|
55
|
+
itemprop_node_for(:image).attr('src')
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_ingredients
|
59
|
+
# NOT FIRST
|
60
|
+
recipe_doc
|
61
|
+
.css('[itemprop="ingredients"]')
|
62
|
+
.map do |ingredient_node|
|
63
|
+
ingredient_node
|
64
|
+
.element_children
|
65
|
+
.map(&:content)
|
66
|
+
.join(" ")
|
67
|
+
end.reject(&:blank?)
|
68
|
+
end
|
69
|
+
|
70
|
+
def parse_prep_time
|
71
|
+
parse_duration(itemprop_node_for(:prepTime))
|
72
|
+
end
|
73
|
+
|
74
|
+
def parse_duration(node)
|
75
|
+
# Allrecipes - 'time' element
|
76
|
+
# Foodnetwork - 'meta' element (std according to
|
77
|
+
# Schema.org/Recipe)
|
78
|
+
case node.name
|
79
|
+
when "meta", "span"
|
80
|
+
node.attr('content')
|
81
|
+
when "time"
|
82
|
+
node.attr('datetime')
|
83
|
+
else
|
84
|
+
NullObject.new
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def parse_recipe_yield
|
89
|
+
itemprop_node_for(:recipeYield).content
|
90
|
+
end
|
91
|
+
|
92
|
+
def parse_total_time
|
93
|
+
# leverage iso8601
|
94
|
+
parse_duration(itemprop_node_for(:totalTime))
|
27
95
|
end
|
28
96
|
|
29
|
-
def
|
30
|
-
|
97
|
+
def strip_review_microformat
|
98
|
+
recipe_doc.css(review_doc).remove
|
31
99
|
end
|
32
100
|
|
33
|
-
def
|
34
|
-
|
101
|
+
def strip_video_object_microformat
|
102
|
+
recipe_doc.css(video_object_doc).remove
|
35
103
|
end
|
36
104
|
|
105
|
+
def itemprop_node_for(property)
|
106
|
+
recipe_doc.css("[itemprop='#{property}']").first || NullObject.new
|
107
|
+
end
|
37
108
|
end
|
38
109
|
|
39
110
|
end
|
data/lib/tychus/version.rb
CHANGED
@@ -4,11 +4,11 @@ describe Tychus::Parsers::SchemaOrgParser do
|
|
4
4
|
let(:parser) { Tychus::Parsers::AllrecipesParser.new(allrecipes_uri) }
|
5
5
|
|
6
6
|
it "strips the Review microformat from node to prevent name collisions with item properties of different microformats" do
|
7
|
-
expect(parser.recipe_doc.css(parser.
|
7
|
+
expect(parser.recipe_doc.css(parser.review_doc)).to be_empty
|
8
8
|
end
|
9
9
|
|
10
10
|
it "strips the videoObject microformat from node to prevent name collisions with item properties of different microformats" do
|
11
|
-
expect(parser.recipe_doc.css(parser.
|
11
|
+
expect(parser.recipe_doc.css(parser.video_object_doc)).to be_empty
|
12
12
|
end
|
13
13
|
|
14
14
|
pending "find a non schema org recipe to test that it does not attempt to call #strip_review_microformat"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tychus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Wayne Yang
|
@@ -171,8 +171,6 @@ files:
|
|
171
171
|
- lib/tychus/parsers/food_network_parser.rb
|
172
172
|
- lib/tychus/parsers/kraft_recipes_parser.rb
|
173
173
|
- lib/tychus/parsers/schema_org_parser.rb
|
174
|
-
- lib/tychus/recipe.rb
|
175
|
-
- lib/tychus/uri_parser.rb
|
176
174
|
- lib/tychus/uri_resolver.rb
|
177
175
|
- lib/tychus/utilities/url_parser.rb
|
178
176
|
- lib/tychus/version.rb
|
data/lib/tychus/recipe.rb
DELETED
File without changes
|
data/lib/tychus/uri_parser.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
# This will attempt to resolve a host for the uri
|
2
|
-
# by first attempting to resolve its canonical uri (if it exists)
|
3
|
-
# and then will return the uri's host and the parsed nokogiri object
|
4
|
-
# Since in order to get the canonical uri, we have to use Nokogiri to
|
5
|
-
# find a link tag, is it too much to expect this class to act as an
|
6
|
-
# uri parser (using Addressable) and a Nokogiri object maker?
|
7
|
-
|
8
|
-
require 'addressable/uri'
|
9
|
-
require 'open-uri'
|
10
|
-
|
11
|
-
module Tychus
|
12
|
-
class URIResolver
|
13
|
-
attr_reader :uri, :doc
|
14
|
-
|
15
|
-
def initialize(uri)
|
16
|
-
@uri = uri
|
17
|
-
@schema_org_canonical_uri_property = 'link[rel="canonical"]'
|
18
|
-
@open_graph_canonical_uri_property = 'meta[property="og:url"]'
|
19
|
-
@doc = Nokogiri::HTML(open(uri))
|
20
|
-
end
|
21
|
-
|
22
|
-
def resolve_uri
|
23
|
-
# try to retrieve host from canonical uri in markup
|
24
|
-
# else resort to given uri
|
25
|
-
canonical_uri(schema_org_canonical_uri_property).presence || \
|
26
|
-
canonical_uri(open_graph_canonical_uri_property) || \
|
27
|
-
Addressible::URI.parse(uri).host
|
28
|
-
end
|
29
|
-
|
30
|
-
def canonical_uri(property)
|
31
|
-
case property
|
32
|
-
when schema_org_canonical_uri_property
|
33
|
-
doc.css(property).first['href']
|
34
|
-
when open_graph_canonical_uri_property
|
35
|
-
doc.css(property).first['content']
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|
40
|
-
end
|