tychus 0.0.3 → 0.0.4

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd51871804589693024dadac44681f0c4a130985
4
- data.tar.gz: 3b50040346e436f0e8d1b5afc467ab5cea740615
3
+ metadata.gz: 31bfca329642b6f6ff3ea7e03c5e51c6ddc19dc1
4
+ data.tar.gz: cc5950e7db596ed6f2726d8eeeb139a15ef3a543
5
5
  SHA512:
6
- metadata.gz: 54bd3b6f23e870586f14ae2b7eceabd7d28b9c3ac93d12cdeece96df71832a2de438fc88bee8e6218da454eb931e3acd45f1f4e1e9a177ef4cda896f41e96bce
7
- data.tar.gz: e72d935d9102885d2f2e8b93303826cb1fec941b2f998ef9bd9cb864eb78d3b365870dc6ee037421d498e0db116932000622419234c7505143046a9aee108548
6
+ metadata.gz: 5373215f9f13c69a02bf53af7d67a41d2abe42485cec37f3fb530c0ca7f24b442c070fad454b595473b45fb8262b9dd37ee7021cc97c09202ae839f05267f811
7
+ data.tar.gz: 4a75a2251cea8003277b304eaf596746578f4ec2ec7d1a0644e0cc904424514066c98a5c0bc2eeeacca4e1a836ef962cd3cf7d8d28132dafb3137b9359ffca88
@@ -1,7 +1,6 @@
1
1
  module Tychus
2
2
  module Parsers
3
3
 
4
- # Allrecipes uses schema.org's recipe microformat
5
4
  class AllrecipesParser < SchemaOrgParser
6
5
  def self.uri_host
7
6
  "allrecipes.com"
@@ -32,7 +32,7 @@ module Parsers
32
32
  @uri = uri
33
33
  @recipe = Recipe.new
34
34
  @doc = Nokogiri::HTML(open(uri))
35
- @recipe_doc = @doc.css(self.class.root_doc)
35
+ @recipe_doc = @doc.css(root_doc)
36
36
  end
37
37
 
38
38
  def parse
@@ -43,94 +43,6 @@ module Parsers
43
43
  recipe
44
44
  end
45
45
 
46
- def parse_author
47
- # is it always first?
48
- itemprop_node_for(:author).content
49
- end
50
-
51
- def parse_description
52
- # is it always first?
53
- itemprop_node_for(:description).content
54
- end
55
-
56
- def parse_recipe_instructions
57
- # strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
58
- #
59
- # Allrecipes: <li><span>lorem ipsum</span></li>
60
- # FoodNetwork: <p>lorem ipsum</p>
61
- # reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
62
- reject_regex = /^(h.|div)$/
63
-
64
- clean_instructions(itemprop_node_for(:recipeInstructions)
65
- .element_children
66
- .reject { |node| node.name =~ reject_regex }
67
- .map do |node|
68
- node.content
69
- .squeeze(" ")
70
- .rstrip
71
- .split("\r\n\s\r\n\s")
72
- end.flatten.reject(&:blank?))
73
- end
74
-
75
- def parse_name
76
- # is it always first?
77
- itemprop_node_for(:name).content
78
- end
79
-
80
- def parse_cook_time
81
- # is it always first?
82
- # leverage iso8601
83
- parse_duration(itemprop_node_for(:cookTime))
84
- end
85
-
86
- def parse_image
87
- # is it always first?
88
- itemprop_node_for(:image).attr('src')
89
- end
90
-
91
- def parse_ingredients
92
- # NOT FIRST
93
- recipe_doc
94
- .css('[itemprop="ingredients"]')
95
- .map do |ingredient_node|
96
- ingredient_node
97
- .element_children
98
- .map(&:content)
99
- .join(" ")
100
- end.reject(&:blank?)
101
- end
102
-
103
- def parse_prep_time
104
- # is it always first?
105
- # leverage iso8601
106
- parse_duration(itemprop_node_for(:prepTime))
107
- end
108
-
109
- def parse_duration(node)
110
- # Allrecipes - 'time' element
111
- # Foodnetwork - 'meta' element (std according to
112
- # Schema.org/Recipe)
113
- case node.name
114
- when "meta", "span"
115
- node.attr('content')
116
- when "time"
117
- node.attr('datetime')
118
- else
119
- NullObject.new
120
- end
121
- end
122
-
123
- def parse_recipe_yield
124
- # is it always first?
125
- itemprop_node_for(:recipeYield).content
126
- end
127
-
128
- def parse_total_time
129
- # is it always first?
130
- # leverage iso8601
131
- parse_duration(itemprop_node_for(:totalTime))
132
- end
133
-
134
46
  def recipe_attributes
135
47
  self.class.recipe_attributes
136
48
  end
@@ -7,6 +7,7 @@ module Parsers
7
7
 
8
8
  def parse_author
9
9
  # in the case of an author advertising her TV show
10
+ # TODO: test case where the author is _not_
10
11
  itemprop_node_for(:author)
11
12
  .css('span')
12
13
  .first
@@ -21,17 +22,10 @@ module Parsers
21
22
  end
22
23
 
23
24
  def parse_ingredients
24
- # NOT FIRST
25
25
  recipe_doc
26
26
  .css('[itemprop="ingredients"]')
27
27
  .map { |node| node.content.lstrip.squeeze(" ").chomp }
28
28
  end
29
-
30
- def clean_instructions(obj)
31
- #TODO: what is best pattern to share this behavior?
32
- obj
33
- end
34
-
35
29
  end
36
30
  end
37
31
  end
@@ -3,37 +3,108 @@ module Parsers
3
3
 
4
4
  class SchemaOrgParser < Base
5
5
 
6
+ attr_reader :root_doc, :review_doc, :video_object_doc
7
+
6
8
  def initialize(uri)
9
+ @root_doc = '[itemtype="http://schema.org/Recipe"]'
10
+ @review_doc = '[itemtype="http://schema.org/Review"]'
11
+ @video_object_doc = '[itemtype="http://www.schema.org/VideoObject"]'
7
12
  super
8
13
  strip_review_microformat
9
14
  strip_video_object_microformat
10
15
  end
11
16
 
12
- def strip_review_microformat
13
- recipe_doc.css(self.class.review_doc).remove
17
+ def parse_author
18
+ itemprop_node_for(:author).content
14
19
  end
15
20
 
16
- def strip_video_object_microformat
17
- recipe_doc.css(self.class.video_object_doc).remove
21
+ def parse_description
22
+ # is it always first?
23
+ itemprop_node_for(:description).content
18
24
  end
19
25
 
26
+ def parse_recipe_instructions
27
+ # strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
28
+ #
29
+ # Allrecipes: <li><span>lorem ipsum</span></li>
30
+ # FoodNetwork: <p>lorem ipsum</p>
31
+ # reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
32
+ reject_regex = /^(h.|div)$/
20
33
 
21
- def self.root_doc
22
- '[itemtype="http://schema.org/Recipe"]'
34
+ clean_instructions(itemprop_node_for(:recipeInstructions)
35
+ .element_children
36
+ .reject { |node| node.name =~ reject_regex }
37
+ .map do |node|
38
+ node.content
39
+ .squeeze(" ")
40
+ .rstrip
41
+ .split("\r\n\s\r\n\s")
42
+ end.flatten.reject(&:blank?))
23
43
  end
24
44
 
25
- def itemprop_node_for(property)
26
- recipe_doc.css("[itemprop='#{property}']").first || NullObject.new
45
+ def parse_name
46
+ itemprop_node_for(:name).content
47
+ end
48
+
49
+ def parse_cook_time
50
+ # leverage iso8601
51
+ parse_duration(itemprop_node_for(:cookTime))
52
+ end
53
+
54
+ def parse_image
55
+ itemprop_node_for(:image).attr('src')
56
+ end
57
+
58
+ def parse_ingredients
59
+ # NOT FIRST
60
+ recipe_doc
61
+ .css('[itemprop="ingredients"]')
62
+ .map do |ingredient_node|
63
+ ingredient_node
64
+ .element_children
65
+ .map(&:content)
66
+ .join(" ")
67
+ end.reject(&:blank?)
68
+ end
69
+
70
+ def parse_prep_time
71
+ parse_duration(itemprop_node_for(:prepTime))
72
+ end
73
+
74
+ def parse_duration(node)
75
+ # Allrecipes - 'time' element
76
+ # Foodnetwork - 'meta' element (std according to
77
+ # Schema.org/Recipe)
78
+ case node.name
79
+ when "meta", "span"
80
+ node.attr('content')
81
+ when "time"
82
+ node.attr('datetime')
83
+ else
84
+ NullObject.new
85
+ end
86
+ end
87
+
88
+ def parse_recipe_yield
89
+ itemprop_node_for(:recipeYield).content
90
+ end
91
+
92
+ def parse_total_time
93
+ # leverage iso8601
94
+ parse_duration(itemprop_node_for(:totalTime))
27
95
  end
28
96
 
29
- def self.review_doc
30
- '[itemtype="http://schema.org/Review"]'
97
+ def strip_review_microformat
98
+ recipe_doc.css(review_doc).remove
31
99
  end
32
100
 
33
- def self.video_object_doc
34
- '[itemtype="http://www.schema.org/VideoObject"]'
101
+ def strip_video_object_microformat
102
+ recipe_doc.css(video_object_doc).remove
35
103
  end
36
104
 
105
+ def itemprop_node_for(property)
106
+ recipe_doc.css("[itemprop='#{property}']").first || NullObject.new
107
+ end
37
108
  end
38
109
 
39
110
  end
@@ -1,3 +1,3 @@
1
1
  module Tychus
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -4,11 +4,11 @@ describe Tychus::Parsers::SchemaOrgParser do
4
4
  let(:parser) { Tychus::Parsers::AllrecipesParser.new(allrecipes_uri) }
5
5
 
6
6
  it "strips the Review microformat from node to prevent name collisions with item properties of different microformats" do
7
- expect(parser.recipe_doc.css(parser.class.review_doc)).to be_empty
7
+ expect(parser.recipe_doc.css(parser.review_doc)).to be_empty
8
8
  end
9
9
 
10
10
  it "strips the videoObject microformat from node to prevent name collisions with item properties of different microformats" do
11
- expect(parser.recipe_doc.css(parser.class.video_object_doc)).to be_empty
11
+ expect(parser.recipe_doc.css(parser.video_object_doc)).to be_empty
12
12
  end
13
13
 
14
14
  pending "find a non schema org recipe to test that it does not attempt to call #strip_review_microformat"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tychus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Wayne Yang
@@ -171,8 +171,6 @@ files:
171
171
  - lib/tychus/parsers/food_network_parser.rb
172
172
  - lib/tychus/parsers/kraft_recipes_parser.rb
173
173
  - lib/tychus/parsers/schema_org_parser.rb
174
- - lib/tychus/recipe.rb
175
- - lib/tychus/uri_parser.rb
176
174
  - lib/tychus/uri_resolver.rb
177
175
  - lib/tychus/utilities/url_parser.rb
178
176
  - lib/tychus/version.rb
data/lib/tychus/recipe.rb DELETED
File without changes
@@ -1,40 +0,0 @@
1
- # This will attempt to resolve a host for the uri
2
- # by first attempting to resolve its canonical uri (if it exists)
3
- # and then will return the uri's host and the parsed nokogiri object
4
- # Since in order to get the canonical uri, we have to use Nokogiri to
5
- # find a link tag, is it too much to expect this class to act as an
6
- # uri parser (using Addressable) and a Nokogiri object maker?
7
-
8
- require 'addressable/uri'
9
- require 'open-uri'
10
-
11
- module Tychus
12
- class URIResolver
13
- attr_reader :uri, :doc
14
-
15
- def initialize(uri)
16
- @uri = uri
17
- @schema_org_canonical_uri_property = 'link[rel="canonical"]'
18
- @open_graph_canonical_uri_property = 'meta[property="og:url"]'
19
- @doc = Nokogiri::HTML(open(uri))
20
- end
21
-
22
- def resolve_uri
23
- # try to retrieve host from canonical uri in markup
24
- # else resort to given uri
25
- canonical_uri(schema_org_canonical_uri_property).presence || \
26
- canonical_uri(open_graph_canonical_uri_property) || \
27
- Addressible::URI.parse(uri).host
28
- end
29
-
30
- def canonical_uri(property)
31
- case property
32
- when schema_org_canonical_uri_property
33
- doc.css(property).first['href']
34
- when open_graph_canonical_uri_property
35
- doc.css(property).first['content']
36
- end
37
- end
38
-
39
- end
40
- end