tychus 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +49 -0
- data/Rakefile +2 -0
- data/lib/tychus.rb +12 -0
- data/lib/tychus/parser_selector.rb +17 -0
- data/lib/tychus/parsers.rb +17 -0
- data/lib/tychus/parsers/allrecipes_parser.rb +18 -0
- data/lib/tychus/parsers/base.rb +157 -0
- data/lib/tychus/parsers/food_network_parser.rb +38 -0
- data/lib/tychus/parsers/kraft_recipes_parser.rb +55 -0
- data/lib/tychus/parsers/schema_org_parser.rb +41 -0
- data/lib/tychus/recipe.rb +0 -0
- data/lib/tychus/uri_parser.rb +40 -0
- data/lib/tychus/uri_resolver.rb +40 -0
- data/lib/tychus/utilities/url_parser.rb +0 -0
- data/lib/tychus/version.rb +3 -0
- data/questions.md +10 -0
- data/spec/fixtures/allrecipes.html +3003 -0
- data/spec/fixtures/campbellskitchen.html +2190 -0
- data/spec/fixtures/food_network_double_ingredients_group.html +3725 -0
- data/spec/fixtures/food_network_single_ingredients_group.html +4930 -0
- data/spec/fixtures/kraftrecipes.html +2722 -0
- data/spec/parser_selector.rb +14 -0
- data/spec/parsers/allrecipes_parser_spec.rb +44 -0
- data/spec/parsers/base_spec.rb +3 -0
- data/spec/parsers/food_network_parser_spec.rb +89 -0
- data/spec/parsers/kraft_recipes_parser_spec.rb +67 -0
- data/spec/parsers/schema_org_parser_spec.rb +17 -0
- data/spec/spec_helper.rb +89 -0
- data/spec/tychus_spec.rb +3 -0
- data/spec/uri_resolver_spec.rb +25 -0
- data/tychus.gemspec +29 -0
- metadata +233 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bb7646d41e612da74f91b9fd838925f632fe1621
|
4
|
+
data.tar.gz: 45b6939c694f5a9221bc3be47accc4cd966eae0f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 68ed88a988f294e3880b9ed7ad4ae9813fe89da7eac26979ef3a7ffb0209345a89d98fd011d17e513ec445a8529b631e4f25f50828be5c7b96664c8f45228f64
|
7
|
+
data.tar.gz: 7860d9a43959274dd1aadf56a1b67335a3900f3c4d74a872c49e8063a84df2704d889064447e0c090d6b32611bf9b0c35ec219677b826c22c2f1273ee7d91a4c
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
*.bundle
|
19
|
+
*.so
|
20
|
+
*.o
|
21
|
+
*.a
|
22
|
+
mkmf.log
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Wayne Yang
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Tychus
|
2
|
+
|
3
|
+
Recipe parser supporting microformats for:
|
4
|
+
|
5
|
+
* [Schema.org/Recipe](https://support.google.com/webmasters/answer/173379?hl=en)
|
6
|
+
|
7
|
+
Compatible with:
|
8
|
+
|
9
|
+
* [Allrecipes](http://allrecipes.com)
|
10
|
+
* [Food Network](http://www.foodnetwork.com)
|
11
|
+
* [Kraft Recipes](http://www.kraftrecipes.com)
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'tychus'
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
$ bundle
|
22
|
+
|
23
|
+
Or install it yourself as:
|
24
|
+
|
25
|
+
$ gem install tychus
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
```
|
30
|
+
require 'tychus'
|
31
|
+
recipe = Tychus.parse('http://allrecipes.com/Recipe/Chicken-Pot-Pie-IX/Detail.aspx?soid=recs_recipe_2')
|
32
|
+
|
33
|
+
recipe.name
|
34
|
+
=> "Chicken Pot Pie IX"
|
35
|
+
recipe.author
|
36
|
+
=> "Robbie Rice"
|
37
|
+
recipe.description
|
38
|
+
=> "\"A delicious chicken pie made from scratch with carrots, peas and celery.\""
|
39
|
+
recipe.ingredients
|
40
|
+
=> ["1 pound skinless, boneless chicken breast halves - cubed", "1 cup sliced carrots", "1 cup frozen green peas", "1/2 cup sliced celery", "1/3 cup butter", "1/3 cup chopped onion", "1/3 cup all-purpose flour", "1/2 teaspoon salt", "1/4 teaspoon black pepper", "1/4 teaspoon celery seed", "1 3/4 cups chicken broth", "2/3 cup milk", "2 (9 inch) unbaked pie crusts"]
|
41
|
+
```
|
42
|
+
|
43
|
+
## Contributing
|
44
|
+
|
45
|
+
1. Fork it ( https://github.com/[my-github-username]/tychus/fork )
|
46
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
47
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
48
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
49
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/lib/tychus.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require "tychus/version"
|
2
|
+
require "tychus/parsers"
|
3
|
+
require "tychus/parser_selector"
|
4
|
+
require "tychus/uri_resolver"
|
5
|
+
|
6
|
+
module Tychus
  # Parses the recipe at +uri+ with the parser selected for its resolved host.
  #
  # @param uri [String] location of the recipe page
  # @return [Object] whatever the selected parser's #parse returns
  # @raise [ArgumentError] if no registered parser matches the resolved host
  def self.parse(uri)
    host = URIResolver.new(uri).resolve_uri
    parser = ParserSelector.resolve_parser(host)

    # resolve_parser yields nil when no parser matches; fail with a clear
    # message instead of a NoMethodError on nil.
    unless parser
      raise ArgumentError,
            "no recipe parser available for #{uri.inspect} (resolved host: #{host.inspect})"
    end

    parser.new(uri).parse
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Tychus
  # Picks the site-specific parser class for a host.
  class ParserSelector
    # Parser classes consulted, in order, by .resolve_parser.
    PARSERS = [
      Tychus::Parsers::AllrecipesParser,
      Tychus::Parsers::FoodNetworkParser,
      Tychus::Parsers::KraftRecipesParser
    ]

    # Returns the first parser class whose +uri_host+ occurs in +host+,
    # or nil when none matches.
    #
    # @param host [String, nil] host to look up
    # @return [Class, nil] matching parser class
    def self.resolve_parser(host)
      PARSERS.detect do |parser|
        # uri_host is a literal domain, not a pattern: escape it so that "."
        # only matches a dot (otherwise "allrecipesXcom" would match too).
        host =~ /#{Regexp.escape(parser.uri_host)}/
      end
    end
  end
end
|
17
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require_relative 'parsers/base'
|
2
|
+
require_relative 'parsers/schema_org_parser'
|
3
|
+
require_relative 'parsers/allrecipes_parser'
|
4
|
+
require_relative 'parsers/food_network_parser'
|
5
|
+
require_relative 'parsers/kraft_recipes_parser'
|
6
|
+
|
7
|
+
module Tychus
  module Parsers
    # Value object holding one member per entry of Base.recipe_attributes.
    Recipe = Struct.new(*Base.recipe_attributes)

    # Shorter reader names for the schema.org-style members.
    Recipe.class_eval do
      alias_method :yield, :recipe_yield
      alias_method :instructions, :recipe_instructions
    end
  end
end
|
17
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Tychus
  module Parsers
    # Parser for Allrecipes pages, which use schema.org's recipe microformat.
    class AllrecipesParser < SchemaOrgParser
      class << self
        # Host fragment used to match this parser to a URI.
        def uri_host
          "allrecipes.com"
        end
      end

      # Drops the final entry of the instruction list; per the original
      # author it is the "Kitchen Friendly View" element, not a real step.
      def clean_instructions(instructions)
        instructions[0...-1]
      end
    end
  end
end
|
18
|
+
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'active_support/core_ext/object/blank.rb'
|
2
|
+
|
3
|
+
module Tychus
  module Parsers
    # Shared recipe-extraction logic for the site parsers.
    #
    # Base is not usable on its own: it calls +self.class.root_doc+ and
    # +itemprop_node_for+, neither of which it defines, so a subclass has to
    # provide both.
    class Base
      attr_reader :uri, :doc, :recipe_doc, :recipe

      # Names of the recipe fields that #parse fills in. For every name here
      # there must be a matching +parse_<name>+ instance method.
      def self.recipe_attributes
        # TODO: clear up these attributes. Are they used? Real example to
        # verify?
        # recipeType
        # photo
        # published
        # summary
        # review - see schema.org/Review
        %i[
          name
          author
          description
          prep_time
          cook_time
          total_time
          recipe_yield
          ingredients
          recipe_instructions
          image
        ]
      end

      # Loads the document at +uri+ and narrows it to the recipe root.
      #
      # NOTE(review): Kernel#open is used, so +uri+ may be a local path; it
      # only fetches URLs when open-uri has been loaded elsewhere, and it
      # treats a leading "|" as a command to run - do not pass untrusted
      # input without validating it first.
      #
      # @param uri [String] location of the recipe page
      def initialize(uri)
        @uri = uri
        @recipe = Recipe.new
        # Block form so the underlying handle is closed once the document has
        # been parsed instead of lingering until garbage collection.
        @doc = open(uri) { |io| Nokogiri::HTML(io) }
        @recipe_doc = @doc.css(self.class.root_doc)
      end

      # Runs every +parse_<attribute>+ method and stores the result on the
      # recipe; NullObject results are stored as nil.
      #
      # @return [Recipe] the populated recipe
      def parse
        recipe_attributes.each do |attr|
          property_value = __send__("parse_#{attr}")
          recipe.__send__("#{attr}=", Value(property_value))
        end
        recipe
      end

      def parse_author
        # is it always first?
        itemprop_node_for(:author).content
      end

      def parse_description
        # is it always first?
        itemprop_node_for(:description).content
      end

      # Collects the text of each child of the recipeInstructions node as a
      # list of steps.
      def parse_recipe_instructions
        # strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
        #
        # Allrecipes: <li><span>lorem ipsum</span></li>
        # FoodNetwork: <p>lorem ipsum</p>
        # reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
        reject_regex = /^(h.|div)$/

        steps = itemprop_node_for(:recipeInstructions)
                .element_children
                .reject { |node| node.name =~ reject_regex }
                .map do |node|
                  node.content
                      .squeeze(" ")
                      .rstrip
                      .split("\r\n\s\r\n\s")
                end

        clean_instructions(steps.flatten.reject(&:blank?))
      end

      def parse_name
        # is it always first?
        itemprop_node_for(:name).content
      end

      def parse_cook_time
        # is it always first?
        # leverage iso8601
        parse_duration(itemprop_node_for(:cookTime))
      end

      def parse_image
        # is it always first?
        itemprop_node_for(:image).attr('src')
      end

      # Builds one string per ingredient node by joining the text of its
      # child elements with a space; blank results are dropped.
      def parse_ingredients
        # NOT FIRST
        recipe_doc
          .css('[itemprop="ingredients"]')
          .map do |ingredient_node|
            ingredient_node
              .element_children
              .map(&:content)
              .join(" ")
          end.reject(&:blank?)
      end

      def parse_prep_time
        # is it always first?
        # leverage iso8601
        parse_duration(itemprop_node_for(:prepTime))
      end

      # Reads the duration attribute appropriate to the node's element type.
      #
      # @param node [#name, #attr] duration node (or a NullObject)
      # @return [Object] the attribute value, or a NullObject for any other
      #   element type
      def parse_duration(node)
        # Allrecipes - 'time' element
        # Foodnetwork - 'meta' element (std according to
        # Schema.org/Recipe)
        case node.name
        when "meta", "span"
          node.attr('content')
        when "time"
          node.attr('datetime')
        else
          NullObject.new
        end
      end

      def parse_recipe_yield
        # is it always first?
        itemprop_node_for(:recipeYield).content
      end

      def parse_total_time
        # is it always first?
        # leverage iso8601
        parse_duration(itemprop_node_for(:totalTime))
      end

      # Instance-level shortcut for Base.recipe_attributes.
      def recipe_attributes
        self.class.recipe_attributes
      end

      # Hook for subclasses to post-process the instruction list; the default
      # returns it untouched.
      def clean_instructions(obj)
        obj
      end

      # Converts a NullObject into nil and passes anything else through.
      def Value(obj)
        case obj
        when NullObject then nil
        else obj
        end
      end
    end

    # Stand-in for a missing node: every method call returns the object
    # itself, so a chain of calls on a missing node never raises.
    class NullObject
      def method_missing(*args, &block)
        self
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Tychus
  module Parsers
    # Parser for Food Network recipe pages; builds on the schema.org parser
    # and overrides the fields where the site's markup differs.
    class FoodNetworkParser < SchemaOrgParser
      # Host fragment used to match this parser to a URI.
      def self.uri_host
        "foodnetwork.com"
      end

      # Returns the author's name from the first span inside the author node.
      # Falls back to the node's own content when it has no span, rather
      # than failing on nil.
      def parse_author
        # in the case of an author advertising her TV show
        author_node = itemprop_node_for(:author)
        (author_node.css('span').first || author_node).content
      end

      # Returns the page's Open Graph description, or nil when the page has
      # no og:description meta tag.
      def parse_description
        # Foodnetwork does not use the description in its recipe body
        # resort to opengraph to pull out description in head
        # TODO: pull this func out for an opengraph parser?
        meta = @doc.css('meta[property="og:description"]').first
        meta && meta.attr('content')
      end

      # One whitespace-normalized string per ingredient node.
      def parse_ingredients
        # NOT FIRST
        recipe_doc
          .css('[itemprop="ingredients"]')
          .map { |node| node.content.lstrip.squeeze(" ").chomp }
      end

      # Instructions need no site-specific clean-up; returned untouched.
      def clean_instructions(obj)
        #TODO: what is best pattern to share this behavior?
        obj
      end
    end
  end
end
|
38
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Tychus
  module Parsers
    # Parser for Kraft Recipes pages; builds on the schema.org parser and
    # strips the \r, \n and \t padding found in the site's markup.
    class KraftRecipesParser < SchemaOrgParser
      # Host fragment used to match this parser to a URI.
      def self.uri_host
        "kraftrecipes.com"
      end

      # Returns the content of the page's description meta tag, or nil when
      # the tag is absent.
      def parse_description
        # description can be found in .recipeDesc or meta tag in header
        # TODO: pull out meta tag parsing into own methods/class?
        meta = doc.css('meta[name="description"]').first
        meta && meta.attr('content')
      end

      # Recipe name with carriage returns, newlines and tabs removed.
      def parse_name
        # "\r\n\tSweet BBQ Chicken Kabobs\r\n\t"
        result = super
        result.gsub(/(\r|\n|\t)/, '')
      end

      # Splits the text of each child of the recipeInstructions node on
      # "\r\n\t", removes leftover tabs and drops blank entries.
      def parse_recipe_instructions
        itemprop_node_for(:recipeInstructions)
          .element_children
          .map do |node|
            node.content
                .squeeze(" ")
                .rstrip
                .split("\r\n\t")
                .map { |step| step.gsub(/\t/, '') }
          end.flatten.reject(&:blank?)
      end

      # One string per ingredient node: the cleaned text of its child
      # elements joined with a space; blank results are dropped.
      def parse_ingredients
        # NOT FIRST
        # raw child text looks like "1 lb.\r\n\t\t\t\t\t\t\t\t boneless skinless chicken breasts, ..."
        recipe_doc
          .css('[itemprop="ingredients"]')
          .map do |ingredient_node|
            ingredient_node
              .element_children
              .map do |node|
                node.content
                    .strip
                    .squeeze(" ")
                    .gsub(/(\r|\n|\t)/, '')
              end.join(" ")
          end.reject(&:blank?)
      end
    end
  end
end
|
55
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Tychus
  module Parsers
    # Parser for pages marked up with the schema.org Recipe microformat.
    class SchemaOrgParser < Base
      class << self
        # CSS selector for the element that wraps the recipe.
        def root_doc
          '[itemtype="http://schema.org/Recipe"]'
        end

        # CSS selector for embedded Review items.
        def review_doc
          '[itemtype="http://schema.org/Review"]'
        end

        # CSS selector for embedded VideoObject items.
        def video_object_doc
          '[itemtype="http://www.schema.org/VideoObject"]'
        end
      end

      # Loads the page via Base, then removes embedded Review and VideoObject
      # items from the recipe subtree.
      def initialize(uri)
        super(uri)
        strip_review_microformat
        strip_video_object_microformat
      end

      # Removes every Review item found inside the recipe subtree.
      def strip_review_microformat
        embedded_reviews = recipe_doc.css(self.class.review_doc)
        embedded_reviews.remove
      end

      # Removes every VideoObject item found inside the recipe subtree.
      def strip_video_object_microformat
        embedded_videos = recipe_doc.css(self.class.video_object_doc)
        embedded_videos.remove
      end

      # Returns the first node in the recipe subtree carrying the given
      # itemprop, or a NullObject when there is none.
      def itemprop_node_for(property)
        matches = recipe_doc.css("[itemprop='#{property}']")
        matches.first || NullObject.new
      end
    end
  end
end
|
41
|
+
|