tychus 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. checksums.yaml +4 -4
  2. data/lib/tychus/meta_parser.rb +55 -0
  3. data/lib/tychus/parser_selector.rb +4 -3
  4. data/lib/tychus/parsers/campbells_kitchen_parser.rb +10 -0
  5. data/lib/tychus/parsers/open_graph_protocol_parser.rb +35 -0
  6. data/lib/tychus/parsers/schema_org_parser.rb +19 -19
  7. data/lib/tychus/parsers.rb +2 -0
  8. data/lib/tychus/uri_resolver.rb +3 -3
  9. data/lib/tychus/version.rb +1 -1
  10. data/lib/tychus.rb +3 -2
  11. data/spec/fixtures/cassettes/allrecipes_1.yml +14403 -0
  12. data/spec/fixtures/cassettes/campbells_kitchen_1.yml +3364 -0
  13. data/spec/fixtures/cassettes/food_network_single_ingredients_group_1.yml +4884 -0
  14. data/spec/fixtures/cassettes/kraft_recipes_1.yml +3638 -0
  15. data/spec/fixtures/cassettes/meta_parser_og_protocol_uri.yml +6733 -0
  16. data/spec/fixtures/cassettes/meta_parser_schema_org_microformat_uri.yml +9605 -0
  17. data/spec/meta_parser.rb +36 -0
  18. data/spec/parsers/allrecipes_parser_spec.rb +7 -6
  19. data/spec/parsers/campbells_kitchen_parser_spec.rb +71 -0
  20. data/spec/parsers/food_network_parser_spec.rb +7 -4
  21. data/spec/parsers/kraft_recipes_parser_spec.rb +6 -2
  22. data/spec/parsers/schema_org_parser_spec.rb +6 -2
  23. data/spec/spec_helper.rb +1 -1
  24. data/spec/uri_resolver_spec.rb +48 -10
  25. metadata +20 -11
  26. data/spec/fixtures/allrecipes.html +0 -3003
  27. data/spec/fixtures/campbellskitchen.html +0 -2190
  28. data/spec/fixtures/food_network_double_ingredients_group.html +0 -3725
  29. data/spec/fixtures/food_network_single_ingredients_group.html +0 -4930
  30. data/spec/fixtures/kraftrecipes.html +0 -2722
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6bc9018fe36889fe35e6f7c628bca5e4920b5fb4
4
- data.tar.gz: a6af0d410b9c61d59c618d447ff53df239b8ea30
3
+ metadata.gz: 67e310db920ab96a5b75531a2f33192ee9c2ffbe
4
+ data.tar.gz: c72aab160fbcad2c93d3336eb1e45b7fd47d5c03
5
5
  SHA512:
6
- metadata.gz: 9019b38648df1eb14df472991eaca4e8f349b6456a7c78d4b4458cd713e03d0ee61c8a24904dea1e60c8159de56c04799808f03b8c98c4d2089e480a1cba40a9
7
- data.tar.gz: 714686e496f584676dd25677287eec0fc63973dc87ea566a49685068ae1251660385cb6b0f2abf9d13459799766050ebcd3e2878248d24e0662794c1519df75e
6
+ metadata.gz: f90fde48cc106cad5d0337497114b58aa912cb35640c7ecbd91d807a0bdff233dd107d8e5a88f85a7fbc56b6a4c56d14ffdb87d9e48ebc8fe7684a5fa895540b
7
+ data.tar.gz: fc8051a49ac3b08896a6f03249de5ce2f8570389921ed88e473fa61454c500e057265af765254879eee276d0853fe292db014838f7967769c3e088829b0bd6ed
@@ -0,0 +1,55 @@
1
+ # This MetaParser returns a Meta object, which contains attributes the
2
+ # ParserSelector will check against to select the appropriate parser
3
+
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+
7
module Tychus
  # Value object describing a fetched page; it is handed to ParserSelector.
  #
  # Members:
  # * uri_object             - whatever URIResolver#resolve_uri returned
  # * open_graph_protocol    - value of the <html> element's xmlns:og
  #                            attribute, or nil when absent
  # * schema_org_microformat - true when the page contains an element with
  #                            itemtype="http://schema.org/Recipe"
  Meta = Struct.new(:uri_object, :open_graph_protocol, :schema_org_microformat) do
    alias_method :schema_org_microformat?, :schema_org_microformat

    # The resolved URI rendered as a String.
    def uri
      uri_object.to_s
    end

    # Host component of the resolved URI.
    def host
      uri_object.host
    end

    # True when an Open Graph namespace value was recorded.
    #
    # Written with core String methods so it does not depend on
    # ActiveSupport's #present? having been loaded; nil, empty and
    # whitespace-only values all count as absent.
    def open_graph_protocol?
      !open_graph_protocol.to_s.strip.empty?
    end
  end

  # Fetches a page and summarises it as a Meta object.
  class MetaParser
    attr_reader :meta, :doc

    # Downloads and parses the document at +uri+ (network I/O happens here).
    def initialize(uri)
      @uri = uri
      @meta = Meta.new
      @doc = Nokogiri::HTML(open(uri))
    end

    # Fills in every Meta attribute and returns the Meta.
    def parse
      set_uri
      set_open_graph_protocol
      set_schema_org_microformat

      meta
    end

    # Records the xmlns:og attribute of the <html> element.
    def set_open_graph_protocol
      html = doc.css('html').first

      # A document without an <html> node stores nil instead of raising
      # NoMethodError on nil.
      meta.open_graph_protocol = html && html.attr('xmlns:og')
    end

    # Records whether any element is marked up as a schema.org Recipe.
    def set_schema_org_microformat
      recipe_nodes = doc.css('[itemtype="http://schema.org/Recipe"]')

      meta.schema_org_microformat = !recipe_nodes.empty?
    end

    # Records the URI resolved from the already-parsed document.
    def set_uri
      meta.uri_object = URIResolver.new(@uri, doc).resolve_uri
    end
  end
end
55
+
@@ -3,12 +3,13 @@ module Tychus
3
3
  PARSERS = [
4
4
  Tychus::Parsers::AllrecipesParser,
5
5
  Tychus::Parsers::FoodNetworkParser,
6
- Tychus::Parsers::KraftRecipesParser
6
+ Tychus::Parsers::KraftRecipesParser,
7
+ Tychus::Parsers::CampbellsKitchenParser
7
8
  ]
8
9
 
9
- def self.resolve_parser(host)
10
# Returns the first class in PARSERS whose uri_host occurs in the page's
# host, or nil when none matches.
#
# meta_object may be an object exposing #host (such as Tychus::Meta) or a
# plain host String, which keeps the previous call style working.
def self.resolve_parser(meta_object)
  # Match on the host only. Struct#to_s is the full inspect output, so
  # matching on it let a URL that merely mentions a supported host in its
  # path or query select the wrong parser.
  host = meta_object.respond_to?(:host) ? meta_object.host : meta_object
  host = host.to_s

  PARSERS.detect do |parser|
    # Escape the host so that "." only matches a literal dot.
    host =~ /#{Regexp.escape(parser.uri_host)}/
  end
end
14
15
 
@@ -0,0 +1,10 @@
1
module Tychus
  module Parsers
    # Parser entry for campbellskitchen.com. The class body adds only the
    # host name; everything else is inherited from OpenGraphProtocolParser.
    class CampbellsKitchenParser < OpenGraphProtocolParser
      class << self
        # Host name this parser is registered for.
        def uri_host
          "campbellskitchen.com"
        end
      end
    end
  end
end
10
+
@@ -0,0 +1,35 @@
1
+ # URIs using FB's open graph protocol store all recipe values
2
+ # within their <head>, as meta tags with 'property' attributes:
3
+ # * og:url
4
+ # * og:title
5
+ # * og:image
6
+
7
module Tychus
  module Parsers
    # Reads recipe attributes from the <meta> tags found inside a page's
    # <head>.
    #
    # NOTE(review): this class defines no #parse and declares no superclass,
    # yet Tychus.parse calls `parser.new(uri).parse` — presumably it was
    # meant to build on the shared parser base class; confirm before use.
    class OpenGraphProtocolParser
      attr_reader :uri, :doc, :root_doc, :recipe_doc

      # uri - address of the recipe page.
      # doc - optional already-parsed document (anything responding to #css).
      #       When omitted the page is fetched and parsed here; this relies
      #       on nokogiri and open-uri having been loaded elsewhere in the
      #       gem (tychus/meta_parser requires both).
      def initialize(uri, doc = nil)
        @uri = uri
        @root_doc = 'head'
        # The original read @doc without ever assigning it, so construction
        # always failed.
        @doc = doc || Nokogiri::HTML(open(uri))
        @recipe_doc = @doc.css(root_doc)
      end

      # Content of the og:image meta tag, or nil when absent.
      def parse_image
        og_node_for(:image)
      end

      # Content of the og:title meta tag, or nil when absent.
      def parse_name
        og_node_for(:title)
      end

      # Content of <meta name="description">, or nil when absent.
      def parse_description
        node = recipe_doc.css('meta[name="description"]').first
        node && node.attr('content')
      end

      # Returns the content attribute of <meta property="og:PROPERTY">, or
      # nil when the tag is missing.
      def og_node_for(property)
        # Must be double-quoted: the single-quoted original never
        # interpolated #{property}, so the selector could not match.
        node = recipe_doc.css("meta[property=\"og:#{property}\"]").first
        node && node.attr('content')
      end
    end
  end
end
34
+
35
+
@@ -27,25 +27,6 @@ module Parsers
27
27
  itemprop_node_for(:description).content
28
28
  end
29
29
 
30
- def parse_recipe_instructions
31
- # strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
32
- #
33
- # Allrecipes: <li><span>lorem ipsum</span></li>
34
- # FoodNetwork: <p>lorem ipsum</p>
35
- # reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
36
- reject_regex = /^(h.|div)$/
37
-
38
- itemprop_node_for(:recipeInstructions)
39
- .element_children
40
- .reject { |node| node.name =~ reject_regex }
41
- .map do |node|
42
- node.content
43
- .squeeze(" ")
44
- .rstrip
45
- .split("\r\n\s\r\n\s")
46
- end.flatten.reject(&:blank?)
47
- end
48
-
49
30
  def parse_cook_time
50
31
  # leverage iso8601
51
32
  parse_duration(itemprop_node_for(:cookTime))
@@ -89,6 +70,25 @@ module Parsers
89
70
  parse_duration(itemprop_node_for(:prepTime))
90
71
  end
91
72
 
73
# Collects the recipe's instruction steps as an array of non-blank strings.
#
# Expected markup, per site:
#   Allrecipes:  <li><span>lorem ipsum</span></li>
#   FoodNetwork: <p>lorem ipsum</p>
def parse_recipe_instructions
  # Headings (e.g. "Directions") and divs (e.g. FoodNetwork's .categories)
  # are not steps.
  reject_regex = /^(h.|div)$/

  step_nodes = itemprop_node_for(:recipeInstructions)
    .element_children
    .reject { |child| child.name =~ reject_regex }

  # Collapse runs of spaces, drop trailing whitespace, then break apart
  # steps that are separated by blank carriage-return lines.
  steps = step_nodes.flat_map do |child|
    cleaned = child.content.squeeze(" ").rstrip
    cleaned.split("\r\n\s\r\n\s")
  end

  steps.reject(&:blank?)
end
91
+
92
92
  def parse_recipe_yield
93
93
  itemprop_node_for(:recipeYield).content
94
94
  end
@@ -1,8 +1,10 @@
1
1
  require_relative 'parsers/base'
2
2
  require_relative 'parsers/schema_org_parser'
3
+ require_relative 'parsers/open_graph_protocol_parser'
3
4
  require_relative 'parsers/allrecipes_parser'
4
5
  require_relative 'parsers/food_network_parser'
5
6
  require_relative 'parsers/kraft_recipes_parser'
7
+ require_relative 'parsers/campbells_kitchen_parser'
6
8
 
7
9
  module Tychus
8
10
  module Parsers
@@ -11,10 +11,10 @@ module Tychus
11
11
  class URIResolver
12
12
  attr_reader :doc, :schema_org_canonical_uri_property, :open_graph_canonical_uri_property
13
13
 
14
- def initialize(uri)
14
# Sets up the CSS selectors used to look up canonical URIs and the document
# they are applied to.
#
# uri - address of the page; only fetched when no doc is supplied.
# doc - optional already-parsed document to reuse instead of downloading
#       the page again.
def initialize(uri, doc = nil)
  @schema_org_canonical_uri_property = 'link[rel="canonical"]'
  @open_graph_canonical_uri_property = 'meta[property="og:url"]'
  # `@doc ||= ...` ignored the doc argument: @doc is always nil on a fresh
  # instance, so the page was re-fetched even when a document was passed in.
  @doc = doc || Nokogiri::HTML(open(uri))
end
19
19
 
20
20
  def resolve_uri
@@ -24,7 +24,7 @@ module Tychus
24
24
  canonical_uri(open_graph_canonical_uri_property).presence || \
25
25
  uri
26
26
 
27
- Addressable::URI.parse(full_uri).host
27
+ Addressable::URI.parse(full_uri)
28
28
  end
29
29
 
30
30
  def canonical_uri(property)
@@ -1,3 +1,3 @@
1
1
  module Tychus
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/tychus.rb CHANGED
@@ -2,11 +2,12 @@ require "tychus/version"
2
2
  require "tychus/parsers"
3
3
  require "tychus/parser_selector"
4
4
  require "tychus/uri_resolver"
5
+ require "tychus/meta_parser"
5
6
 
6
7
module Tychus
  class << self
    # Parses the recipe at +uri+: builds the page's Meta, asks ParserSelector
    # for the parser class that goes with it, then returns the result of
    # running that parser on the same uri.
    def parse(uri)
      page_meta = MetaParser.new(uri).parse
      parser_class = ParserSelector.resolve_parser(page_meta)

      parser_class.new(uri).parse
    end
  end
end