open_graph_reader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitmodules +3 -0
  3. data/.rspec +2 -0
  4. data/.yardopts +1 -0
  5. data/lib/open_graph_reader.rb +83 -0
  6. data/lib/open_graph_reader/base.rb +57 -0
  7. data/lib/open_graph_reader/builder.rb +100 -0
  8. data/lib/open_graph_reader/definitions.rb +333 -0
  9. data/lib/open_graph_reader/fetcher.rb +82 -0
  10. data/lib/open_graph_reader/object.rb +95 -0
  11. data/lib/open_graph_reader/object/dsl.rb +130 -0
  12. data/lib/open_graph_reader/object/dsl/types.rb +71 -0
  13. data/lib/open_graph_reader/object/registry.rb +54 -0
  14. data/lib/open_graph_reader/parser.rb +85 -0
  15. data/lib/open_graph_reader/parser/graph.rb +136 -0
  16. data/lib/open_graph_reader/version.rb +4 -0
  17. data/spec/fixtures/examples/apple-touch-icon-precomposed.png +0 -0
  18. data/spec/fixtures/examples/apple-touch-icon.png +0 -0
  19. data/spec/fixtures/examples/article-offset.html +25 -0
  20. data/spec/fixtures/examples/article-utc.html +25 -0
  21. data/spec/fixtures/examples/article.html +25 -0
  22. data/spec/fixtures/examples/audio-array.html +27 -0
  23. data/spec/fixtures/examples/audio-url.html +25 -0
  24. data/spec/fixtures/examples/audio.html +24 -0
  25. data/spec/fixtures/examples/book-isbn10.html +27 -0
  26. data/spec/fixtures/examples/book.html +27 -0
  27. data/spec/fixtures/examples/canadian.html +16 -0
  28. data/spec/fixtures/examples/error.html +17 -0
  29. data/spec/fixtures/examples/errors/article-date.html +25 -0
  30. data/spec/fixtures/examples/errors/book-author.html +27 -0
  31. data/spec/fixtures/examples/errors/book.html +27 -0
  32. data/spec/fixtures/examples/errors/gender.html +20 -0
  33. data/spec/fixtures/examples/errors/geo.html +23 -0
  34. data/spec/fixtures/examples/errors/type.html +16 -0
  35. data/spec/fixtures/examples/errors/video-duration.html +42 -0
  36. data/spec/fixtures/examples/favicon.ico +0 -0
  37. data/spec/fixtures/examples/filters/xss-image.html +15 -0
  38. data/spec/fixtures/examples/image-array.html +26 -0
  39. data/spec/fixtures/examples/image-toosmall.html +24 -0
  40. data/spec/fixtures/examples/image-url.html +22 -0
  41. data/spec/fixtures/examples/image.html +21 -0
  42. data/spec/fixtures/examples/index.html +67 -0
  43. data/spec/fixtures/examples/media/audio/1khz.mp3 +0 -0
  44. data/spec/fixtures/examples/media/audio/250hz.mp3 +0 -0
  45. data/spec/fixtures/examples/media/images/1.png +0 -0
  46. data/spec/fixtures/examples/media/images/50.png +0 -0
  47. data/spec/fixtures/examples/media/images/75.png +0 -0
  48. data/spec/fixtures/examples/media/images/icon.png +0 -0
  49. data/spec/fixtures/examples/media/images/logo.png +0 -0
  50. data/spec/fixtures/examples/media/images/train.jpg +0 -0
  51. data/spec/fixtures/examples/media/video/train.flv +0 -0
  52. data/spec/fixtures/examples/media/video/train.mp4 +0 -0
  53. data/spec/fixtures/examples/media/video/train.webm +0 -0
  54. data/spec/fixtures/examples/min.html +14 -0
  55. data/spec/fixtures/examples/nomedia.html +20 -0
  56. data/spec/fixtures/examples/plain.html +10 -0
  57. data/spec/fixtures/examples/profile.html +25 -0
  58. data/spec/fixtures/examples/required.html +20 -0
  59. data/spec/fixtures/examples/robots.txt +4 -0
  60. data/spec/fixtures/examples/sitemap.xml +23 -0
  61. data/spec/fixtures/examples/video-array.html +36 -0
  62. data/spec/fixtures/examples/video-movie.html +42 -0
  63. data/spec/fixtures/examples/video.html +26 -0
  64. data/spec/integration/invalid_examples_spec.rb +69 -0
  65. data/spec/integration/valid_examples_spec.rb +76 -0
  66. data/spec/open_graph_reader_spec.rb +94 -0
  67. data/spec/spec_helper.rb +35 -0
  68. metadata +247 -0
@@ -0,0 +1,82 @@
1
+ require 'faraday'
2
+
3
module OpenGraphReader
  # Fetch a URI to retrieve its HTML body, if available.
  #
  # Responses are cached per instance: once a GET or HEAD response was
  # received it is reused by {#body} and {#html?}.
  #
  # @api private
  class Fetcher
    # Create a new fetcher.
    #
    # A duplicate of Faraday's default connection is used. If
    # +FaradayMiddleware+ is loaded, its +FollowRedirects+ middleware is
    # inserted at the front of the stack unless it is already present.
    #
    # @param [URI] uri the URI to fetch.
    # @raise [ArgumentError] If +uri+ is not an instance of URI.
    def initialize uri
      raise ArgumentError, "url needs to be an instance of URI" unless uri.is_a? URI
      @uri = uri
      @connection = Faraday.default_connection.dup

      if defined? FaradayMiddleware
        unless @connection.builder.handlers.include? FaradayMiddleware::FollowRedirects
          @connection.builder.insert(0, FaradayMiddleware::FollowRedirects)
        end
      end
    end

    # The URL to fetch
    #
    # @return [String]
    def url
      @uri.to_s
    end

    # Fetch the full page with a GET request and remember the response.
    #
    # @return [Faraday::Response]
    def fetch
      @get_response = @connection.get(@uri)
    end
    alias_method :fetch_body, :fetch

    # Fetch just the headers with a HEAD request and remember the response.
    #
    # @return [Faraday::Response]
    def fetch_headers
      @head_response = @connection.head(@uri)
    end

    # Retrieve the body, fetching the page first if that did not happen yet.
    #
    # @todo Custom error class
    # @raise [ArgumentError] The received content does not seem to be HTML.
    # @return [String]
    def body
      fetch_body unless fetched?
      raise ArgumentError, "Did not receive a HTML site at #{@uri}" unless html?
      @get_response.body
    end

    # Whether the target URI seems to return HTML.
    #
    # Uses the GET response if there is one, otherwise the HEAD response
    # (issuing a HEAD request first when neither is available yet).
    #
    # @return [Bool]
    def html?
      fetch_headers unless fetched_headers?
      response = @get_response || @head_response
      return false unless response.success?

      content_type = response['content-type']
      return false unless content_type

      # Media types are case-insensitive, so normalize before matching.
      content_type.downcase.include? 'text/html'
    end

    # Whether the target URI was fetched.
    #
    # @return [Bool]
    def fetched?
      !@get_response.nil?
    end
    alias_method :fetched_body?, :fetched?

    # Whether the headers of the target URI were fetched. A full fetch
    # counts too, since the GET response also carries the headers.
    #
    # @return [Bool]
    def fetched_headers?
      !@get_response.nil? || !@head_response.nil?
    end
  end
end
@@ -0,0 +1,95 @@
1
+ require 'open_graph_reader/object/registry'
2
+ require 'open_graph_reader/object/dsl'
3
+ require 'open_graph_reader/object/dsl/types'
4
+
5
module OpenGraphReader

  # This module provides the base functionality for all OpenGraph objects
  # and makes the {DSL} methods for describing them available when included.
  #
  # @example Define a new object
  #   class MyObject
  #     include OpenGraphReader::Object
  #
  #     namespace :my, :object
  #     content :string
  #     string :name, required: true
  #   end
  module Object
    # Extends the including class with the {DSL} class methods.
    #
    # @private
    def self.included base
      base.extend DSL
    end

    # If the namespace this object represents had a value, it is available here
    # @return [String, nil]
    attr_reader :content

    # Regular properties on this object
    #
    # @api private
    # @return [{String => String, Object}]
    attr_reader :properties

    # Properties on this object that are arrays.
    #
    # @api private
    # @return [{String => Array<String, Object>}]
    attr_reader :children


    # Create a new object. If your class overrides this don't forget to call <tt>super</tt>.
    def initialize
      @properties = {}
      # Reading an unknown key creates and stores an empty array for it.
      @children = Hash.new {|h, k| h[k] = [] }
    end

    # Whether this object has the given property
    #
    # @param [#to_s] name
    # @return [Bool]
    def has_property? name
      self.class.available_properties.include? name.to_s
    end

    # Set the content for this object in case it is also a property on
    # another object. The value is passed through the class' content
    # processor before it is stored.
    #
    # @api private
    # @param [String] value
    def content= value
      value = self.class.content_processor.call(value)
      @content = value
    end

    # Get a property on this object.
    #
    # @api private
    # @param [#to_s] name
    # @todo right error?
    # @raise [InvalidObjectError] If the requested property is undefined.
    # @return [String, Object]
    def [] name
      raise InvalidObjectError, "Undefined property #{name} on #{inspect}" unless has_property? name
      properties[name.to_s]
    end

    # Set the property to the given value by calling its writer method.
    #
    # @api private
    # @param [#to_s] name
    # @param [String, Object] value
    # @raise [InvalidObjectError] If the requested property is undefined.
    def []= name, value
      raise InvalidObjectError, "Undefined property #{name} on #{inspect}" unless has_property? name
      public_send "#{name}=", value
    end

    # Returns the string form of {#content} if there is any content,
    # otherwise the default representation.
    #
    # The content processor may produce non-String values (numbers,
    # booleans, ...); those are converted so this method always returns a
    # String, and a +false+ content is not mistaken for "no content".
    #
    # @return [String]
    def to_s
      content.nil? ? super : content.to_s
    end
  end
end
@@ -0,0 +1,130 @@
1
+ require 'open_graph_reader/object/registry'
2
+
3
module OpenGraphReader
  module Object
    # This module provides the methods to define new types and properties,
    # as well as setting other metadata necessary to describe an object, such
    # as its namespace.
    module DSL
      # @!macro define_type_description
      #   @param [Symbol] name the name of the property in the current namespace
      #   @param [{Symbol => Bool, Class, Array<String>}] options additional options
      #   @option options [Bool] :required (false) Make the property required.
      #   @option options [Bool] :collection (false) This property can occur multiple times.
      #   @option options [Class] :to This property maps to the given object (optional).
      #   @option options [Array<String>] :verticals This property
      #     belongs to the given verticals of the object (optional).
      #   @option options [::Object] :default Returned by the reader while no value is set (optional).
      #
      # @!macro property
      #   @!attribute [rw] $1

      # @!macro [attach] define_type
      #   @!method $1(name, options={})
      #   @!macro define_type_description
      #
      # Defines a new DSL method for modeling a new type. Calling the
      # generated DSL method on a class declares a property: a reader and
      # writer for plain properties, or a reader plus a pluralized reader
      # (<tt>"#{name}s"</tt>) for collections.
      #
      # @param [Symbol] name the name of the type and of the DSL method
      # @yield convert and validate
      # @yieldparam [::Object] value the value to be converted and validated
      # @yieldparam [Array<::Object>] *args any additional arguments
      # @yieldparam [{Symbol => Bool, Class, Array<String>}] options the options hash as last parameter
      def self.define_type(name, &processor)
        processors[name] = processor

        define_method(name) do |property, *args|
          key = property.to_s
          available_properties << key
          # A trailing hash is the options hash; the remaining args are
          # handed to the processor on every assignment.
          options = args.last.is_a?(Hash) ? args.pop : {}

          Registry.register [@namespace, property].join(':'), options[:to] if options[:to]

          if options[:verticals]
            options[:verticals].each do |vertical|
              verticals[[@namespace, vertical].join('.')] << property
            end
          end

          if options[:collection]
            define_method("#{property}s") do
              children[key]
            end

            define_method(property) do
              # TODO raise if required
              first = children[key].first
              # Only a missing value falls back to the default, a stored
              # false is a real value.
              first.nil? ? options[:default] : first
            end
          else
            define_method(property) do
              # TODO raise if required
              value = properties[key]
              # Only a missing value falls back to the default, a stored
              # false is a real value.
              value.nil? ? options[:default] : value
            end

            define_method("#{property}=") do |value|
              # TODO: figure out a sane way to distinguish subobject properties
              # Object is OpenGraphReader::Object here: already built
              # objects are stored as they are, everything else is converted.
              value = processor.call(value, *args, options) unless value.is_a? Object
              properties[key] = value
            end
          end
        end
      end
      singleton_class.send(:alias_method, :define_type_with_args, :define_type)

      # @overload namespace
      #   Get the namespace of this object.
      #
      #   @return [String] A colon separated namespace, for example <tt>og:image</tt>.
      # @overload namespace(*names)
      #   Set the namespace of this object and register the object for it.
      #
      #   @param [Array<#to_s>] *names The individual parts of the namespace as list
      #   @example
      #     namespace :og, :image
      def namespace *names
        return @namespace if names.empty?
        @namespace = names.join(':')
        Registry.register @namespace, self
      end

      # Set the type for the content attribute
      #
      # @param [Symbol] type one of the registered types.
      def content type
        @content_processor = DSL.processors[type]
      end

      # The list of defined properties on this object.
      #
      # @return [Array<String>]
      def available_properties
        @available_properties ||= []
      end

      # A map from type names to processing blocks.
      #
      # @api private
      # @return [{Symbol => Proc}]
      def self.processors
        @processors ||= {}
      end

      # The processor for the content attribute. Falls back to a processor
      # that returns the value unchanged.
      #
      # @api private
      # @return [Proc]
      def content_processor
        @content_processor || proc {|value| value }
      end

      # A map from vertical names to attributes that belong to them.
      #
      # @api private
      # @return [{String => Array<String>}]
      def verticals
        @verticals ||= Hash.new {|h, k| h[k] = [] }
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'date'
2
+
3
+ require 'open_graph_reader/object/dsl'
4
+
5
module OpenGraphReader
  module Object
    module DSL
      # Converts the value to a String.
      #
      # @see http://ogp.me/#string
      define_type :string do |value|
        value.to_s
      end

      # Converts the value to a String and requires it to start with
      # <tt>http://</tt> or <tt>https://</tt>.
      #
      # @see http://ogp.me/#url
      define_type :url do |value|
        url = value.to_s

        unless url.start_with?('http://', 'https://')
          raise InvalidObjectError, "URL #{url.inspect} does not start with http:// or https://"
        end

        url
      end

      # Requires the value to be one of the allowed values.
      #
      # @!method enum(name, allowed, options={})
      #   @param [Array<String>] allowed the list of allowed values
      #   @!macro define_type_description
      # @see http://ogp.me/#enum
      define_type_with_args :enum do |value, allowed|
        unless allowed.include? value
          raise InvalidObjectError, "Expected one of #{allowed.inspect} but was #{value.inspect}"
        end

        value.to_s
      end

      # Converts the value to an Integer.
      #
      # @see http://ogp.me/#integer
      define_type :integer do |value|
        begin
          # Strings are parsed as decimal numbers, otherwise a leading zero
          # ("010") would be read as octal and "0x10" as hexadecimal.
          # Kernel#Integer only accepts a base for String arguments.
          value.is_a?(String) ? Integer(value, 10) : Integer(value)
        rescue ArgumentError, TypeError
          # TypeError is raised for values such as nil.
          raise InvalidObjectError, "Integer expected, but was #{value.inspect}"
        end
      end

      # Parses the value as an ISO 8601 datetime.
      #
      # @see http://ogp.me/#datetime
      define_type :datetime do |value|
        begin
          DateTime.iso8601 value
        rescue ArgumentError, TypeError
          # TypeError is raised for non-String values such as nil.
          raise InvalidObjectError, "ISO8601 datetime expected, but was #{value.inspect}"
        end
      end

      # Accepts exactly the strings "true", "false", "1" and "0".
      #
      # @see http://ogp.me/#bool
      define_type :boolean do |value|
        bool = {'true' => true, 'false' => false, '1' => true, '0' => false}[value]

        if bool.nil?
          raise InvalidObjectError, "Boolean expected, but was #{value.inspect}"
        end

        bool
      end

      # Converts the value to a Float.
      #
      # @see http://ogp.me/#float
      define_type :float do |value|
        begin
          Float(value)
        rescue ArgumentError, TypeError
          # TypeError is raised for values such as nil.
          raise InvalidObjectError, "Float expected, but was #{value.inspect}"
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'singleton'
2
+ require 'forwardable'
3
+
4
module OpenGraphReader
  module Object
    # Lookup table from namespaces to the classes representing them. There
    # is a single shared instance; the class level methods forward to it.
    #
    # @api private
    class Registry
      extend Forwardable
      include Singleton

      class << self
        # Register a new namespace in the registry.
        #
        # @param [String] namespace The namespace in colon separated form, for example <tt>og:image</tt>.
        # @param [Class] klass The class to register. It should include {Object}.
        # @api private
        def register(namespace, klass)
          instance.register(namespace, klass)
        end

        # Check whether a namespace is registered.
        #
        # @param [String] namespace The namespace in colon separated form, for example <tt>og:image</tt>.
        # @return [Bool]
        # @api private
        def registered?(namespace)
          instance.registered?(namespace)
        end

        # Fetch the class associated with the given namespace
        #
        # @param [String] namespace The namespace in colon separated form, for example <tt>og:image</tt>.
        # @return [Class] The matching class.
        # @raise [ArgumentError] If the given namespace wasn't registered.
        # @api private
        def [](namespace)
          instance[namespace]
        end
      end

      # Writing and membership checks go straight to the backing hash,
      # each under its Hash name and under a descriptive name.
      def_delegator :@namespaces, :[]=
      def_delegator :@namespaces, :[]=, :register
      def_delegator :@namespaces, :has_key?
      def_delegator :@namespaces, :has_key?, :registered?

      # Start with an empty table.
      def initialize
        @namespaces = {}
      end

      # @see Registry.[]
      def [] namespace
        @namespaces.fetch(namespace) do
          raise ArgumentError, "#{namespace} is not a registered namespace"
        end
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'nokogiri'
2
+
3
+ require 'open_graph_reader/parser/graph'
4
+
5
module OpenGraphReader
  # Parse OpenGraph tags in a HTML document into a graph.
  #
  # @api private
  class Parser
    # Namespaces found in the passed documents head tag. Only filled once
    # the graph was built.
    #
    # @return [Array<String>]
    attr_reader :additional_namespaces

    # Create a new parser.
    #
    # @param [#to_s, Nokogiri::XML::Node] html the document to parse.
    # @param [String] origin The source the document was obtained from.
    def initialize html, origin=nil
      @doc = to_doc html
      @origin = origin
      @additional_namespaces = []
    end

    # Whether there are any OpenGraph tags at all.
    #
    # @return [Bool]
    def has_tags?
      !graph.empty?
    end

    # Build and return the {Graph}. The graph is built once and reused.
    #
    # @raise [NoOpenGraphDataError] If the document has no head tag.
    # @return [Graph]
    def graph
      @graph ||= build_graph
    end

    private

    # Walk all matching meta tags of the head and sort them into a tree,
    # one level per colon separated part of the property name.
    def build_graph
      graph = Graph.new
      head = @doc.xpath('/html/head').first

      raise NoOpenGraphDataError, "There's no head tag in #{@doc}" unless head

      head.xpath("meta[#{property_condition(head)}]").each do |tag|
        content = tag['content']
        # A meta tag without content attribute carries no value.
        next if content.nil?

        *path, leaf = tag['property'].split(':')
        parent = path.inject(graph.root) {|node, name| find_or_add_child(node, name) }

        # TODO: make stripping configurable?
        parent << Graph::Node.new(leaf, content.strip)
      end

      graph
    end

    # Build the XPath condition selecting the properties of the og
    # namespace and of all namespaces declared in the prefix attribute of
    # the head. Remembers the declared namespaces.
    def property_condition head
      prefixes = ['og']

      if head['prefix']
        @additional_namespaces = head['prefix'].scan(/(\w+):\s*([^ ]+)/).map(&:first)
        prefixes.concat @additional_namespaces
      end

      # The colon is part of the match, so that the prefix "article" does
      # not select properties of a namespace like "articles".
      prefixes.uniq.map {|prefix| "starts-with(@property, '#{prefix}:')" }.join(' or ')
    end

    # Return the most recently added child of node with the given name,
    # adding a new one if there is none.
    def find_or_add_child node, name
      existing = node.children.reverse.find {|child| child.name == name }
      return existing if existing

      Graph::Node.new(name).tap {|child| node << child }
    end

    # Use a Nokogiri node as it is, parse everything else as HTML.
    def to_doc html
      case html
      when Nokogiri::XML::Node
        html
      else
        Nokogiri::HTML.parse(html.to_s)
      end
    end
  end
end