open_graph_reader 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitmodules +3 -0
  3. data/.rspec +2 -0
  4. data/.yardopts +1 -0
  5. data/lib/open_graph_reader.rb +83 -0
  6. data/lib/open_graph_reader/base.rb +57 -0
  7. data/lib/open_graph_reader/builder.rb +100 -0
  8. data/lib/open_graph_reader/definitions.rb +333 -0
  9. data/lib/open_graph_reader/fetcher.rb +82 -0
  10. data/lib/open_graph_reader/object.rb +95 -0
  11. data/lib/open_graph_reader/object/dsl.rb +130 -0
  12. data/lib/open_graph_reader/object/dsl/types.rb +71 -0
  13. data/lib/open_graph_reader/object/registry.rb +54 -0
  14. data/lib/open_graph_reader/parser.rb +85 -0
  15. data/lib/open_graph_reader/parser/graph.rb +136 -0
  16. data/lib/open_graph_reader/version.rb +4 -0
  17. data/spec/fixtures/examples/apple-touch-icon-precomposed.png +0 -0
  18. data/spec/fixtures/examples/apple-touch-icon.png +0 -0
  19. data/spec/fixtures/examples/article-offset.html +25 -0
  20. data/spec/fixtures/examples/article-utc.html +25 -0
  21. data/spec/fixtures/examples/article.html +25 -0
  22. data/spec/fixtures/examples/audio-array.html +27 -0
  23. data/spec/fixtures/examples/audio-url.html +25 -0
  24. data/spec/fixtures/examples/audio.html +24 -0
  25. data/spec/fixtures/examples/book-isbn10.html +27 -0
  26. data/spec/fixtures/examples/book.html +27 -0
  27. data/spec/fixtures/examples/canadian.html +16 -0
  28. data/spec/fixtures/examples/error.html +17 -0
  29. data/spec/fixtures/examples/errors/article-date.html +25 -0
  30. data/spec/fixtures/examples/errors/book-author.html +27 -0
  31. data/spec/fixtures/examples/errors/book.html +27 -0
  32. data/spec/fixtures/examples/errors/gender.html +20 -0
  33. data/spec/fixtures/examples/errors/geo.html +23 -0
  34. data/spec/fixtures/examples/errors/type.html +16 -0
  35. data/spec/fixtures/examples/errors/video-duration.html +42 -0
  36. data/spec/fixtures/examples/favicon.ico +0 -0
  37. data/spec/fixtures/examples/filters/xss-image.html +15 -0
  38. data/spec/fixtures/examples/image-array.html +26 -0
  39. data/spec/fixtures/examples/image-toosmall.html +24 -0
  40. data/spec/fixtures/examples/image-url.html +22 -0
  41. data/spec/fixtures/examples/image.html +21 -0
  42. data/spec/fixtures/examples/index.html +67 -0
  43. data/spec/fixtures/examples/media/audio/1khz.mp3 +0 -0
  44. data/spec/fixtures/examples/media/audio/250hz.mp3 +0 -0
  45. data/spec/fixtures/examples/media/images/1.png +0 -0
  46. data/spec/fixtures/examples/media/images/50.png +0 -0
  47. data/spec/fixtures/examples/media/images/75.png +0 -0
  48. data/spec/fixtures/examples/media/images/icon.png +0 -0
  49. data/spec/fixtures/examples/media/images/logo.png +0 -0
  50. data/spec/fixtures/examples/media/images/train.jpg +0 -0
  51. data/spec/fixtures/examples/media/video/train.flv +0 -0
  52. data/spec/fixtures/examples/media/video/train.mp4 +0 -0
  53. data/spec/fixtures/examples/media/video/train.webm +0 -0
  54. data/spec/fixtures/examples/min.html +14 -0
  55. data/spec/fixtures/examples/nomedia.html +20 -0
  56. data/spec/fixtures/examples/plain.html +10 -0
  57. data/spec/fixtures/examples/profile.html +25 -0
  58. data/spec/fixtures/examples/required.html +20 -0
  59. data/spec/fixtures/examples/robots.txt +4 -0
  60. data/spec/fixtures/examples/sitemap.xml +23 -0
  61. data/spec/fixtures/examples/video-array.html +36 -0
  62. data/spec/fixtures/examples/video-movie.html +42 -0
  63. data/spec/fixtures/examples/video.html +26 -0
  64. data/spec/integration/invalid_examples_spec.rb +69 -0
  65. data/spec/integration/valid_examples_spec.rb +76 -0
  66. data/spec/open_graph_reader_spec.rb +94 -0
  67. data/spec/spec_helper.rb +35 -0
  68. metadata +247 -0
@@ -0,0 +1,82 @@
1
+ require 'faraday'
2
+
3
module OpenGraphReader
  # Fetch a URI to retrieve its HTML body, if available.
  #
  # The responses of the HEAD and GET requests are kept on the instance, so
  # each kind of request is issued at most once by {#body} and {#html?}.
  #
  # @api private
  class Fetcher
    # Create a new fetcher.
    #
    # The fetcher works on a copy of Faraday's default connection. If
    # FaradayMiddleware is loaded, its FollowRedirects middleware is put at
    # the front of the copied stack unless it is already part of it.
    #
    # @param [URI] uri the URI to fetch.
    # @raise [ArgumentError] If the given object is not an URI.
    def initialize uri
      raise ArgumentError, "url needs to be an instance of URI" unless uri.is_a? URI
      @uri = uri
      @connection = Faraday.default_connection.dup

      if defined? FaradayMiddleware
        unless @connection.builder.handlers.include? FaradayMiddleware::FollowRedirects
          @connection.builder.insert(0, FaradayMiddleware::FollowRedirects)
        end
      end
    end

    # The URL to fetch
    #
    # @return [String]
    def url
      @uri.to_s
    end

    # Fetch the full page with a GET request and remember the response.
    #
    # @return [Faraday::Response]
    def fetch
      @get_response = @connection.get(@uri)
    end
    alias_method :fetch_body, :fetch

    # Fetch just the headers with a HEAD request and remember the response.
    #
    # @return [Faraday::Response]
    def fetch_headers
      @head_response = @connection.head(@uri)
    end

    # Retrieve the body, fetching the page first if that did not happen yet.
    #
    # @todo Custom error class
    # @raise [ArgumentError] The received content does not seem to be HTML.
    # @return [String]
    def body
      fetch_body unless fetched?
      raise ArgumentError, "Did not receive a HTML site at #{@uri}" unless html?
      @get_response.body
    end

    # Whether the target URI seems to return HTML.
    #
    # Issues a HEAD request if neither headers nor body were fetched yet.
    # The response of a previous GET request is preferred over the one of a
    # HEAD request.
    #
    # @return [Bool]
    def html?
      fetch_headers unless fetched_headers?
      response = @get_response || @head_response
      return false unless response.success?

      content_type = response['content-type']
      return false unless content_type

      # Media types are case-insensitive, "Text/HTML" is as valid as "text/html".
      content_type.downcase.include? 'text/html'
    end

    # Whether the target URI was fetched.
    #
    # @return [Bool]
    def fetched?
      !@get_response.nil?
    end
    alias_method :fetched_body?, :fetched?

    # Whether the headers of the target URI were fetched, either through a
    # HEAD request or as part of a full GET request.
    #
    # @return [Bool]
    def fetched_headers?
      !@get_response.nil? || !@head_response.nil?
    end
  end
end
@@ -0,0 +1,95 @@
1
+ require 'open_graph_reader/object/registry'
2
+ require 'open_graph_reader/object/dsl'
3
+ require 'open_graph_reader/object/dsl/types'
4
+
5
module OpenGraphReader

  # This module provides the base functionality for all OpenGraph objects
  # and makes the {DSL} methods for describing them available when included.
  #
  # @example Define a new object
  #   class MyObject
  #     include OpenGraphReader::Object
  #
  #     namespace :my, :object
  #     content :string
  #     string :name, required: true
  #   end
  module Object
    # Extends the including class with the {DSL}.
    #
    # @private
    def self.included base
      base.extend DSL
    end

    # If the namespace this object represents had a value, it is available here
    # @return [String, nil]
    attr_reader :content

    # Regular properties on this object
    #
    # @api private
    # @return [{String => String, Object}]
    attr_reader :properties

    # Properties on this object that are arrays. Accessing an unknown key
    # creates and stores an empty array for it.
    #
    # @api private
    # @return [{String => Array<String, Object>}]
    attr_reader :children


    # Create a new object. If your class overrides this don't forget to call <tt>super</tt>.
    def initialize
      @properties = {}
      @children = Hash.new {|h, k| h[k] = [] }
    end

    # Whether this object has the given property
    #
    # @param [#to_s] name
    # @return [Bool]
    def has_property? name
      self.class.available_properties.include? name.to_s
    end

    # Set the content for this object in case it is also a property on
    # another object. The value is passed through the content processor of
    # the class before it is stored.
    #
    # @api private
    # @param [String] value
    def content= value
      @content = self.class.content_processor.call(value)
    end

    # Get a property on this object. This reads the stored value directly,
    # without going through the generated reader method.
    #
    # @api private
    # @param [#to_s] name
    # @todo right error?
    # @raise [InvalidObjectError] If the requested property is undefined.
    # @return [String, Object]
    def [] name
      raise InvalidObjectError, "Undefined property #{name} on #{inspect}" unless has_property? name
      properties[name.to_s]
    end

    # Set the property to the given value by calling its writer method.
    #
    # @api private
    # @param [#to_s] name
    # @param [String, Object] value
    # @raise [InvalidObjectError] If the requested property is undefined.
    def []= name, value
      raise InvalidObjectError, "Undefined property #{name} on #{inspect}" unless has_property? name
      public_send "#{name}=", value
    end

    # Returns the string form of {#content} if available, the default
    # representation otherwise.
    #
    # @return [String]
    def to_s
      # The content processor may have produced a non-String value, but to_s
      # has to hand out a String.
      content ? content.to_s : super
    end
  end
end
@@ -0,0 +1,130 @@
1
+ require 'open_graph_reader/object/registry'
2
+
3
module OpenGraphReader
  module Object
    # This module provides the methods to define new types and properties,
    # as well as setting other metadata necessary to describe an object, such
    # as its namespace.
    module DSL
      # @!macro define_type_description
      #   @param [Symbol] name the name of the property in the current namespace
      #   @param [{Symbol => Bool, Class, Array<String>}] options additional options
      #   @option options [Bool] :required (false) Make the property required.
      #   @option options [Bool] :collection (false) This property can occur multiple times.
      #   @option options [Class] :to This property maps to the given object (optional).
      #   @option options [Array<String>] :verticals This property
      #     belongs to the given verticals of the object (optional).
      #
      # @!macro property
      #   @!attribute [rw] $1

      # @!macro [attach] define_type
      #   @!method $1(name, options={})
      #     @!macro define_type_description
      #
      # Defines a new DSL method for modeling a new type.
      #
      # The generated DSL method records the property as available and
      # defines its accessors on the calling class: a reader and a writer for
      # regular properties, or a singular and a plural (<tt>name + "s"</tt>)
      # reader for collections. Readers fall back to <tt>options[:default]</tt>.
      #
      # @yield convert and validate
      # @yieldparam [::Object] value the value to be converted and validated
      # @yieldparam [Array<::Object>] *args any additional arguments
      # @yieldparam [{Symbol => Bool, Class, Array<String>}] options the options hash as last parameter
      def self.define_type(name, &processor)
        processors[name] = processor

        define_method(name) do |property, *args|
          key = property.to_s
          available_properties << key
          # A trailing hash is the options hash, everything before it is
          # handed to the processor as additional arguments.
          options = args.last.is_a?(Hash) ? args.pop : {}

          Registry.register [@namespace, property].join(':'), options[:to] if options[:to]

          if options[:verticals]
            options[:verticals].each do |vertical|
              verticals[[@namespace, vertical].join('.')] << property
            end
          end

          if options[:collection]
            define_method("#{property}s") do
              children[key]
            end

            define_method(property) do
              # TODO raise if required
              # TODO: figure out a sane way to distinguish subobject properties;
              # for now the first child is returned as is.
              children[key].first || options[:default]
            end
          else
            define_method(property) do
              # TODO raise if required
              properties[key] || options[:default]
            end

            define_method("#{property}=") do |value|
              # TODO: figure out a sane way to distinguish subobject properties
              # Values that already are objects are stored unprocessed.
              value = processor.call(value, *args, options) unless value.is_a? Object
              properties[key] = value
            end
          end
        end
      end
      singleton_class.send(:alias_method, :define_type_with_args, :define_type)

      # @overload namespace
      #   Get the namespace of this object.
      #
      #   @return [String] A colon separated namespace, for example <tt>og:image</tt>.
      # @overload namespace(*names)
      #   Set the namespace of this object and register the object for it in
      #   the {Registry}.
      #
      #   @param [Array<#to_s>] *names The individual parts of the namespace as list
      #   @example
      #     namespace :og, :image
      def namespace *names
        return @namespace if names.empty?
        @namespace = names.join(':')
        Registry.register @namespace, self
      end

      # Set the type for the content attribute
      #
      # @param [Symbol] type one of the registered types.
      def content type
        @content_processor = DSL.processors[type]
      end

      # The list of defined properties on this object.
      #
      # @return [Array<String>]
      def available_properties
        @available_properties ||= []
      end

      # A map from type names to processing blocks.
      #
      # @api private
      # @return [{Symbol => Proc}]
      def self.processors
        @processors ||= {}
      end

      # The processor for the content attribute. Passes the value through
      # unchanged if no content type was set.
      #
      # @api private
      # @return [Proc]
      def content_processor
        @content_processor || proc {|value| value }
      end

      # A map from vertical names to attributes that belong to them.
      # Attribute names are stored as they were passed to the DSL method.
      #
      # @api private
      # @return [{String => Array<String>}]
      def verticals
        @verticals ||= Hash.new {|h, k| h[k] = [] }
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'date'
2
+
3
+ require 'open_graph_reader/object/dsl'
4
+
5
module OpenGraphReader
  module Object
    module DSL
      # Converts the value to a String.
      #
      # @see http://ogp.me/#string
      define_type :string do |value|
        value.to_s
      end

      # Converts the value to a String and makes sure it is a HTTP or HTTPS URL.
      #
      # @see http://ogp.me/#url
      define_type :url do |value|
        url = value.to_s
        unless url.start_with?('http://', 'https://')
          raise InvalidObjectError, "URL #{url.inspect} does not start with http:// or https://"
        end

        url
      end

      # Accepts the value only if it is one of the allowed values.
      #
      # @!method enum(name, allowed, options={})
      #   @param [Array<String>] allowed the list of allowed values
      #   @!macro define_type_description
      # @see http://ogp.me/#enum
      define_type_with_args :enum do |value, allowed|
        unless allowed.include? value
          raise InvalidObjectError, "Expected one of #{allowed.inspect} but was #{value.inspect}"
        end

        value.to_s
      end

      # Converts the value to an Integer.
      #
      # @see http://ogp.me/#integer
      define_type :integer do |value|
        begin
          # Without an explicit base Kernel#Integer honors radix prefixes, so
          # "010" would become 8 and "0x10" 16. The base can only be given for
          # strings.
          value.is_a?(String) ? Integer(value, 10) : Integer(value)
        rescue ArgumentError, TypeError
          # TypeError covers values such as nil that cannot be converted at all.
          raise InvalidObjectError, "Integer expected, but was #{value.inspect}"
        end
      end

      # Parses the value as an ISO 8601 datetime.
      #
      # @see http://ogp.me/#datetime
      define_type :datetime do |value|
        begin
          DateTime.iso8601 value
        rescue ArgumentError, TypeError
          # TypeError covers non-String values such as nil.
          raise InvalidObjectError, "ISO8601 datetime expected, but was #{value.inspect}"
        end
      end

      # Maps "true", "false", "1" and "0" to their boolean value.
      #
      # @see http://ogp.me/#bool
      define_type :boolean do |value|
        bool = {'true' => true, 'false' => false, '1' => true, '0' => false}[value]
        raise InvalidObjectError, "Boolean expected, but was #{value.inspect}" if bool.nil?

        bool
      end

      # Converts the value to a Float.
      #
      # @see http://ogp.me/#float
      define_type :float do |value|
        begin
          Float(value)
        rescue ArgumentError, TypeError
          # TypeError covers values such as nil that cannot be converted at all.
          raise InvalidObjectError, "Float expected, but was #{value.inspect}"
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'singleton'
2
+ require 'forwardable'
3
+
4
module OpenGraphReader
  module Object
    # Process-wide lookup table from namespaces to the classes representing
    # them. All class level methods are forwarded to the singleton instance.
    #
    # @api private
    class Registry
      include Singleton
      extend Forwardable

      class << self
        extend Forwardable
        # @!method register(namespace, klass)
        #   Register a new namespace in the registry.
        #
        #   @param [String] namespace The namespace in colon separated form, for example <tt>og:image</tt>.
        #   @param [Class] klass The class to register. It should include {Object}.
        #   @api private
        #
        # @!method registered?(namespace)
        #   Check whether a namespace is registered.
        #
        #   @param [String] namespace The namespace in colon separated form, for example <tt>og:image</tt>.
        #   @return [Bool]
        #   @api private
        #
        # @!method [](namespace)
        #   Fetch the class associated with the given namespace
        #
        #   @param [String] namespace The namespace in colon separated form, for example <tt>og:image</tt>.
        #   @return [Class] The matching class.
        #   @raise [ArgumentError] If the given namespace wasn't registered.
        #   @api private
        def_delegators :instance, :register, :registered?, :[]
      end

      # Start out with an empty table.
      def initialize
        @namespaces = {}
      end

      # @see Registry.register
      def register namespace, klass
        @namespaces[namespace] = klass
      end
      alias_method :[]=, :register

      # @see Registry.registered?
      def registered? namespace
        @namespaces.has_key? namespace
      end
      alias_method :has_key?, :registered?

      # @see Registry.[]
      def [] namespace
        return @namespaces[namespace] if registered? namespace

        raise ArgumentError, "#{namespace} is not a registered namespace"
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'nokogiri'
2
+
3
+ require 'open_graph_reader/parser/graph'
4
+
5
module OpenGraphReader
  # Parse OpenGraph tags in a HTML document into a graph.
  #
  # @api private
  class Parser
    # Namespaces found in the prefix attribute of the passed document's head
    # tag. Empty until the graph was built.
    #
    # @return [Array<String>]
    attr_reader :additional_namespaces

    # Create a new parser.
    #
    # @param [#to_s, Nokogiri::XML::Node] html the document to parse.
    # @param [String] origin The source the document was obtained from.
    def initialize html, origin=nil
      @doc = to_doc html
      @origin = origin
      @additional_namespaces = []
    end

    # Whether there are any OpenGraph tags at all.
    #
    # @return [Bool]
    def has_tags?
      !graph.empty?
    end

    # Build and return the {Graph}. The graph is built once and cached.
    #
    # @raise [NoOpenGraphDataError] If the document has no head tag.
    # @return [Graph]
    def graph
      @graph ||= build_graph
    end

    private

    # Collect all matching meta tags of the head into a new graph. Every
    # colon separated part of the property name but the last one becomes an
    # intermediate node, the last part becomes a node carrying the content.
    def build_graph
      graph = Graph.new
      head = @doc.xpath('/html/head').first

      raise NoOpenGraphDataError, "There's no head tag in #{@doc}" unless head

      head.xpath("meta[#{property_condition(head)}]").each do |tag|
        *path, leaf = tag['property'].split(':')
        parent = path.inject(graph.root) {|current, name| find_or_create_child current, name }

        # TODO: make stripping configurable?
        # A tag without content attribute is treated like one with empty content.
        parent << Graph::Node.new(leaf, tag['content'].to_s.strip)
      end

      graph
    end

    # Build the XPath predicate selecting the meta tags of the og namespace
    # and of all namespaces declared in the prefix attribute of the head.
    # Remembers the declared namespaces in {#additional_namespaces}.
    def property_condition head
      prefixes = ['og']

      if head['prefix']
        @additional_namespaces = head['prefix'].scan(/(\w+):\s*([^ ]+)/).map(&:first)
        prefixes.concat @additional_namespaces
      end

      # The colon is part of the test so that a namespace does not match
      # properties of another namespace it is merely a prefix of.
      prefixes.uniq.map {|prefix| "starts-with(@property, '#{prefix}:')" }.join(' or ')
    end

    # Return the most recently added child of the given node that has the
    # given name, appending a new one if there is none.
    def find_or_create_child parent, name
      existing = parent.children.reverse.find {|child| child.name == name }
      return existing if existing

      created = Graph::Node.new name
      parent << created
      created
    end

    # Use a passed node as is, parse anything else as HTML.
    def to_doc html
      case html
      when Nokogiri::XML::Node
        html
      else
        Nokogiri::HTML.parse(html.to_s)
      end
    end
  end
end