tychus 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/tychus/meta_parser.rb +55 -0
- data/lib/tychus/parser_selector.rb +4 -3
- data/lib/tychus/parsers/campbells_kitchen_parser.rb +10 -0
- data/lib/tychus/parsers/open_graph_protocol_parser.rb +35 -0
- data/lib/tychus/parsers/schema_org_parser.rb +19 -19
- data/lib/tychus/parsers.rb +2 -0
- data/lib/tychus/uri_resolver.rb +3 -3
- data/lib/tychus/version.rb +1 -1
- data/lib/tychus.rb +3 -2
- data/spec/fixtures/cassettes/allrecipes_1.yml +14403 -0
- data/spec/fixtures/cassettes/campbells_kitchen_1.yml +3364 -0
- data/spec/fixtures/cassettes/food_network_single_ingredients_group_1.yml +4884 -0
- data/spec/fixtures/cassettes/kraft_recipes_1.yml +3638 -0
- data/spec/fixtures/cassettes/meta_parser_og_protocol_uri.yml +6733 -0
- data/spec/fixtures/cassettes/meta_parser_schema_org_microformat_uri.yml +9605 -0
- data/spec/meta_parser.rb +36 -0
- data/spec/parsers/allrecipes_parser_spec.rb +7 -6
- data/spec/parsers/campbells_kitchen_parser_spec.rb +71 -0
- data/spec/parsers/food_network_parser_spec.rb +7 -4
- data/spec/parsers/kraft_recipes_parser_spec.rb +6 -2
- data/spec/parsers/schema_org_parser_spec.rb +6 -2
- data/spec/spec_helper.rb +1 -1
- data/spec/uri_resolver_spec.rb +48 -10
- metadata +20 -11
- data/spec/fixtures/allrecipes.html +0 -3003
- data/spec/fixtures/campbellskitchen.html +0 -2190
- data/spec/fixtures/food_network_double_ingredients_group.html +0 -3725
- data/spec/fixtures/food_network_single_ingredients_group.html +0 -4930
- data/spec/fixtures/kraftrecipes.html +0 -2722
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 67e310db920ab96a5b75531a2f33192ee9c2ffbe
|
4
|
+
data.tar.gz: c72aab160fbcad2c93d3336eb1e45b7fd47d5c03
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f90fde48cc106cad5d0337497114b58aa912cb35640c7ecbd91d807a0bdff233dd107d8e5a88f85a7fbc56b6a4c56d14ffdb87d9e48ebc8fe7684a5fa895540b
|
7
|
+
data.tar.gz: fc8051a49ac3b08896a6f03249de5ce2f8570389921ed88e473fa61454c500e057265af765254879eee276d0853fe292db014838f7967769c3e088829b0bd6ed
|
data/lib/tychus/meta_parser.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# This MetaParser returns a Meta object, which contains attributes the
|
2
|
+
# ParserSelector will check against to select the appropriate parser
|
3
|
+
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
|
7
|
+
module Tychus
|
8
|
+
Meta = Struct.new(:uri_object, :open_graph_protocol, :schema_org_microformat) do
|
9
|
+
alias_method :schema_org_microformat?, :schema_org_microformat
|
10
|
+
|
11
|
+
def uri; uri_object.to_s; end
|
12
|
+
def host; uri_object.host; end
|
13
|
+
def open_graph_protocol?; open_graph_protocol.present?; end
|
14
|
+
end
|
15
|
+
|
16
|
+
class MetaParser
|
17
|
+
attr_reader :meta, :doc
|
18
|
+
|
19
|
+
def initialize(uri)
|
20
|
+
@uri = uri
|
21
|
+
@meta = Meta.new
|
22
|
+
@doc = Nokogiri::HTML(open(uri))
|
23
|
+
end
|
24
|
+
|
25
|
+
def parse
|
26
|
+
set_uri
|
27
|
+
set_open_graph_protocol
|
28
|
+
set_schema_org_microformat
|
29
|
+
|
30
|
+
meta
|
31
|
+
end
|
32
|
+
|
33
|
+
def set_open_graph_protocol
|
34
|
+
protocol = doc.css('html').first.attr('xmlns:og')
|
35
|
+
|
36
|
+
meta.__send__("open_graph_protocol=", protocol)
|
37
|
+
end
|
38
|
+
|
39
|
+
def set_schema_org_microformat
|
40
|
+
schema_org_property = '[itemtype="http://schema.org/Recipe"]'
|
41
|
+
nodeset = doc.css(schema_org_property)
|
42
|
+
|
43
|
+
meta.__send__("schema_org_microformat=", nodeset.present?)
|
44
|
+
end
|
45
|
+
|
46
|
+
def set_uri
|
47
|
+
r = URIResolver.new(@uri, doc)
|
48
|
+
uri_object = r.resolve_uri
|
49
|
+
|
50
|
+
meta.uri_object = uri_object
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
|
data/lib/tychus/parser_selector.rb
CHANGED
@@ -3,12 +3,13 @@ module Tychus
|
|
3
3
|
PARSERS = [
|
4
4
|
Tychus::Parsers::AllrecipesParser,
|
5
5
|
Tychus::Parsers::FoodNetworkParser,
|
6
|
-
Tychus::Parsers::KraftRecipesParser
|
6
|
+
Tychus::Parsers::KraftRecipesParser,
|
7
|
+
Tychus::Parsers::CampbellsKitchenParser
|
7
8
|
]
|
8
9
|
|
9
|
-
def self.resolve_parser(
|
10
|
+
def self.resolve_parser(meta_object)
|
10
11
|
PARSERS.detect do |parser|
|
11
|
-
|
12
|
+
meta_object.to_s =~ %r[#{parser.uri_host}]
|
12
13
|
end
|
13
14
|
end
|
14
15
|
|
data/lib/tychus/parsers/open_graph_protocol_parser.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# URIs using FB's open graph protocol store all recipe values
|
2
|
+
# within their <head>, as meta tags with 'property' attributes:
|
3
|
+
# * og:url
|
4
|
+
# * og:title
|
5
|
+
# * og:image
|
6
|
+
|
7
|
+
module Tychus
|
8
|
+
module Parsers
|
9
|
+
class OpenGraphProtocolParser
|
10
|
+
def initialize(uri)
|
11
|
+
@root_doc = 'head'
|
12
|
+
@recipe_doc = @doc.css(root_doc)
|
13
|
+
end
|
14
|
+
|
15
|
+
def parse_image
|
16
|
+
og_node_for(:image)
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse_name
|
20
|
+
og_node_for(:title)
|
21
|
+
end
|
22
|
+
|
23
|
+
def parse_description
|
24
|
+
recipe_doc.css('meta[name="description"]').first.attr('content')
|
25
|
+
end
|
26
|
+
|
27
|
+
def og_node_for(property)
|
28
|
+
node = recipe_doc.css('meta[property=\"og:#{property}\"]').first
|
29
|
+
node.attr('content')
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
data/lib/tychus/parsers/schema_org_parser.rb
CHANGED
@@ -27,25 +27,6 @@ module Parsers
|
|
27
27
|
itemprop_node_for(:description).content
|
28
28
|
end
|
29
29
|
|
30
|
-
def parse_recipe_instructions
|
31
|
-
# strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
|
32
|
-
#
|
33
|
-
# Allrecipes: <li><span>lorem ipsum</span></li>
|
34
|
-
# FoodNetwork: <p>lorem ipsum</p>
|
35
|
-
# reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
|
36
|
-
reject_regex = /^(h.|div)$/
|
37
|
-
|
38
|
-
itemprop_node_for(:recipeInstructions)
|
39
|
-
.element_children
|
40
|
-
.reject { |node| node.name =~ reject_regex }
|
41
|
-
.map do |node|
|
42
|
-
node.content
|
43
|
-
.squeeze(" ")
|
44
|
-
.rstrip
|
45
|
-
.split("\r\n\s\r\n\s")
|
46
|
-
end.flatten.reject(&:blank?)
|
47
|
-
end
|
48
|
-
|
49
30
|
def parse_cook_time
|
50
31
|
# leverage iso8601
|
51
32
|
parse_duration(itemprop_node_for(:cookTime))
|
@@ -89,6 +70,25 @@ module Parsers
|
|
89
70
|
parse_duration(itemprop_node_for(:prepTime))
|
90
71
|
end
|
91
72
|
|
73
|
+
def parse_recipe_instructions
|
74
|
+
# strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
|
75
|
+
#
|
76
|
+
# Allrecipes: <li><span>lorem ipsum</span></li>
|
77
|
+
# FoodNetwork: <p>lorem ipsum</p>
|
78
|
+
# reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
|
79
|
+
reject_regex = /^(h.|div)$/
|
80
|
+
|
81
|
+
itemprop_node_for(:recipeInstructions)
|
82
|
+
.element_children
|
83
|
+
.reject { |node| node.name =~ reject_regex }
|
84
|
+
.map do |node|
|
85
|
+
node.content
|
86
|
+
.squeeze(" ")
|
87
|
+
.rstrip
|
88
|
+
.split("\r\n\s\r\n\s")
|
89
|
+
end.flatten.reject(&:blank?)
|
90
|
+
end
|
91
|
+
|
92
92
|
def parse_recipe_yield
|
93
93
|
itemprop_node_for(:recipeYield).content
|
94
94
|
end
|
data/lib/tychus/parsers.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
require_relative 'parsers/base'
|
2
2
|
require_relative 'parsers/schema_org_parser'
|
3
|
+
require_relative 'parsers/open_graph_protocol_parser'
|
3
4
|
require_relative 'parsers/allrecipes_parser'
|
4
5
|
require_relative 'parsers/food_network_parser'
|
5
6
|
require_relative 'parsers/kraft_recipes_parser'
|
7
|
+
require_relative 'parsers/campbells_kitchen_parser'
|
6
8
|
|
7
9
|
module Tychus
|
8
10
|
module Parsers
|
data/lib/tychus/uri_resolver.rb
CHANGED
@@ -11,10 +11,10 @@ module Tychus
|
|
11
11
|
class URIResolver
|
12
12
|
attr_reader :doc, :schema_org_canonical_uri_property, :open_graph_canonical_uri_property
|
13
13
|
|
14
|
-
def initialize(uri)
|
14
|
+
def initialize(uri, doc=nil)
|
15
15
|
@schema_org_canonical_uri_property = 'link[rel="canonical"]'
|
16
16
|
@open_graph_canonical_uri_property = 'meta[property="og:url"]'
|
17
|
-
@doc
|
17
|
+
@doc ||= Nokogiri::HTML(open(uri))
|
18
18
|
end
|
19
19
|
|
20
20
|
def resolve_uri
|
@@ -24,7 +24,7 @@ module Tychus
|
|
24
24
|
canonical_uri(open_graph_canonical_uri_property).presence || \
|
25
25
|
uri
|
26
26
|
|
27
|
-
Addressable::URI.parse(full_uri)
|
27
|
+
Addressable::URI.parse(full_uri)
|
28
28
|
end
|
29
29
|
|
30
30
|
def canonical_uri(property)
|
data/lib/tychus/version.rb
CHANGED
data/lib/tychus.rb
CHANGED
@@ -2,11 +2,12 @@ require "tychus/version"
|
|
2
2
|
require "tychus/parsers"
|
3
3
|
require "tychus/parser_selector"
|
4
4
|
require "tychus/uri_resolver"
|
5
|
+
require "tychus/meta_parser"
|
5
6
|
|
6
7
|
module Tychus
|
7
8
|
def self.parse(uri)
|
8
|
-
|
9
|
-
parser = ParserSelector.resolve_parser(
|
9
|
+
meta_object = MetaParser.new(uri).parse
|
10
|
+
parser = ParserSelector.resolve_parser(meta_object)
|
10
11
|
parser.new(uri).parse
|
11
12
|
end
|
12
13
|
end
|