micromicro 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.editorconfig +14 -0
- data/.gitignore +34 -0
- data/.gitmodules +3 -0
- data/.reek.yml +8 -0
- data/.rspec +2 -0
- data/.rubocop +3 -0
- data/.rubocop.yml +25 -0
- data/.ruby-version +1 -0
- data/.simplecov +11 -0
- data/.travis.yml +19 -0
- data/CHANGELOG.md +5 -0
- data/CONTRIBUTING.md +37 -0
- data/Gemfile +14 -0
- data/LICENSE +21 -0
- data/README.md +122 -0
- data/Rakefile +18 -0
- data/lib/micro_micro/collections/base_collection.rb +37 -0
- data/lib/micro_micro/collections/items_collection.rb +10 -0
- data/lib/micro_micro/collections/properties_collection.rb +18 -0
- data/lib/micro_micro/collections/relations_collection.rb +23 -0
- data/lib/micro_micro/document.rb +71 -0
- data/lib/micro_micro/implied_property.rb +25 -0
- data/lib/micro_micro/item.rb +151 -0
- data/lib/micro_micro/parsers/base_property_parser.rb +33 -0
- data/lib/micro_micro/parsers/date_time_parser.rb +85 -0
- data/lib/micro_micro/parsers/date_time_property_parser.rb +65 -0
- data/lib/micro_micro/parsers/embedded_markup_property_parser.rb +28 -0
- data/lib/micro_micro/parsers/implied_name_property_parser.rb +78 -0
- data/lib/micro_micro/parsers/implied_photo_property_parser.rb +69 -0
- data/lib/micro_micro/parsers/implied_url_property_parser.rb +61 -0
- data/lib/micro_micro/parsers/plain_text_property_parser.rb +39 -0
- data/lib/micro_micro/parsers/url_property_parser.rb +75 -0
- data/lib/micro_micro/parsers/value_class_pattern_parser.rb +92 -0
- data/lib/micro_micro/property.rb +116 -0
- data/lib/micro_micro/relation.rb +78 -0
- data/lib/micro_micro/version.rb +3 -0
- data/lib/micromicro.rb +39 -0
- data/micromicro.gemspec +28 -0
- metadata +128 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
module MicroMicro
  # Wraps a string of HTML markup and its base URL, exposing the microformats2
  # items and the relations discovered in the parsed document.
  class Document
    # @param markup [String] the HTML to parse
    # @param base_url [String] the URL associated with the provided markup
    def initialize(markup, base_url)
      @markup = markup
      @base_url = base_url
    end

    # Only the object id goes through `format`. The collection summaries are
    # interpolated afterwards so that a literal "%" inside them (for example a
    # percent-encoded URL) is never interpreted as a format directive.
    #
    # @return [String]
    def inspect
      "#<#{self.class.name}:#{format('%#0x', object_id)} items: #{items.inspect}, relations: #{relations.inspect}>"
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def items
      @items ||= Collections::ItemsCollection.new(Item.items_from(document))
    end

    # @return [MicroMicro::Collections::RelationsCollection]
    def relations
      @relations ||= Collections::RelationsCollection.new(Relation.relations_from(document))
    end

    # @see microformats2 Parsing Specification section 1.1
    # @see http://microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
    #
    # @return [Hash] a Hash with the keys :items, :rels and :'rel-urls'
    def to_h
      {
        items: items.to_a,
        rels: relations.group_by_rel,
        'rel-urls': relations.group_by_url
      }
    end

    # @param node [Nokogiri::XML::Element]
    # @return [Boolean] true when the node's name is one of {ignored_node_names}
    def self.ignore_node?(node)
      ignored_node_names.include?(node.name)
    end

    # Names of the elements that are skipped while parsing.
    #
    # @return [Array<String>]
    def self.ignored_node_names
      %w[script style template]
    end

    private

    attr_reader :base_url, :markup

    # The first `<base>` element carrying an `href` attribute, if any. The
    # lookup result is cached even when it is nil so the markup is parsed for
    # this purpose at most once.
    #
    # @return [Nokogiri::XML::Element, nil]
    def base_element
      return @base_element if defined?(@base_element)

      @base_element = Nokogiri::HTML(markup).at_css('base[href]')
    end

    # @return [Nokogiri::HTML::Document]
    def document
      @document ||= Nokogiri::HTML(markup, resolved_base_url)
    end

    # The base URL used to parse the document: `base_url` itself when there is
    # no `<base href>` element, otherwise that element's `href` resolved
    # against `base_url`.
    #
    # @return [String]
    def resolved_base_url
      @resolved_base_url ||=
        if base_element
          Absolutely.to_abs(base: base_url, relative: base_element['href'])
        else
          base_url
        end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module MicroMicro
  # A Property whose value is produced by one of the implied-property parsers
  # listed in IMPLIED_PROPERTY_PARSERS_MAP, selected by the property's name.
  class ImpliedProperty < Property
    # Maps an implied property name to the parser class that computes its value.
    IMPLIED_PROPERTY_PARSERS_MAP = {
      'name' => Parsers::ImpliedNamePropertyParser,
      'photo' => Parsers::ImpliedPhotoPropertyParser,
      'url' => Parsers::ImpliedUrlPropertyParser
    }.freeze

    # @return [Boolean] always true for an implied property
    def implied?
      true
    end

    # @return [Boolean] always false for an implied property
    def item_node?
      false
    end

    private

    # Lazily builds the parser registered for this property's name.
    def parser
      return @parser if @parser

      parser_class = IMPLIED_PROPERTY_PARSERS_MAP[name]
      @parser = parser_class.new(self)
    end
  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
module MicroMicro
  # A microformats2 item: an element carrying one or more `h-*` class names,
  # together with the properties and child items found beneath it.
  class Item
    attr_accessor :value

    # Stores the node and appends the implied `name`, `photo` and `url`
    # properties (in that order) whenever the corresponding check passes.
    #
    # @param node [Nokogiri::XML::Element]
    def initialize(node)
      @node = node

      properties << implied_name if implied_name?
      properties << implied_photo if implied_photo?
      properties << implied_url if implied_url?
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def children
      @children ||= Collections::ItemsCollection.new(Item.items_from(node.element_children))
    end

    # The node's `id` attribute with surrounding whitespace removed.
    #
    # @return [String, nil]
    def id
      @id ||= node['id']&.strip
    end

    # Only the object id goes through `format`; everything else is plain
    # interpolation, so no interpolated text can act as a format directive.
    #
    # @return [String]
    def inspect
      "#<#{self.class.name}:#{format('%#0x', object_id)} types: #{types.inspect}, " \
        "properties: #{properties.count}, children: #{children.count}>"
    end

    # @return [MicroMicro::Collections::PropertiesCollection]
    def properties
      @properties ||= Collections::PropertiesCollection.new(Property.properties_from(node.element_children))
    end

    # @see microformats2 Parsing Specification section 1.2
    # @see http://microformats.org/wiki/microformats2-parsing#parse_an_element_for_class_microformats
    #
    # @return [Hash] always has :type and :properties; :id, :children and
    #   :value are added only when they are present
    def to_h
      hash = {
        type: types,
        properties: properties.to_h
      }

      hash[:id] = id if id.present?
      hash[:children] = children.to_a if children.any?
      hash[:value] = value if value.present?

      hash
    end

    # @return [Array<String>]
    def types
      @types ||= self.class.types_from(node)
    end

    # @param node [Nokogiri::XML::Element]
    # @return [Boolean] true when the node carries at least one root class name
    def self.item_node?(node)
      types_from(node).any?
    end

    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @return [Array<MicroMicro::Item>]
    def self.items_from(context)
      nodes_from(context).map { |node| new(node) }
    end

    # Recursively collects item nodes below `context`. Ignored elements are
    # skipped entirely, descent stops at the first item node on each branch,
    # and an item node that is also a property node is not collected.
    #
    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @param node_set [Nokogiri::XML::NodeSet] accumulator shared across the recursion
    # @return [Nokogiri::XML::NodeSet]
    def self.nodes_from(context, node_set = Nokogiri::XML::NodeSet.new(context.document, []))
      case context
      when Nokogiri::HTML::Document
        nodes_from(context.element_children, node_set)
      when Nokogiri::XML::NodeSet
        context.each { |node| nodes_from(node, node_set) }
      when Nokogiri::XML::Element
        unless Document.ignore_node?(context)
          if item_node?(context)
            node_set << context unless Property.property_node?(context)
          else
            nodes_from(context.element_children, node_set)
          end
        end
      end

      node_set
    end

    # Selects the class names that look like microformats2 root class names,
    # de-duplicated and sorted.
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Array<String>]
    #
    # @example
    #   node = Nokogiri::HTML('<div class="h-card">Jason Garber</div>').at_css('div')
    #   MicroMicro::Item.types_from(node) #=> ['h-card']
    def self.types_from(node)
      node.classes.select { |token| token.match?(/\Ah(?:\-[0-9a-z]+)?(?:\-[a-z]+)+\z/) }.uniq.sort
    end

    private

    attr_reader :node

    # @return [MicroMicro::ImpliedProperty]
    def implied_name
      @implied_name ||= ImpliedProperty.new(node, name: 'name', prefix: 'p')
    end

    # @return [Boolean]
    def implied_name?
      imply_name? && implied_name.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_photo
      @implied_photo ||= ImpliedProperty.new(node, name: 'photo', prefix: 'u')
    end

    # @return [Boolean]
    def implied_photo?
      imply_photo? && implied_photo.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_url
      @implied_url ||= ImpliedProperty.new(node, name: 'url', prefix: 'u')
    end

    # @return [Boolean]
    def implied_url?
      imply_url? && implied_url.value?
    end

    # A name is implied only when there is no `name` property, no `e-*` or
    # `p-*` property at all, and no nested item.
    #
    # @return [Boolean]
    def imply_name?
      properties.none? { |prop| prop.name == 'name' } && properties.none? { |prop| %w[e p].include?(prop.prefix) } && !nested_items?
    end

    # A photo is implied only when there is no `photo` property, no explicit
    # (non-implied) `u-*` property, and no nested item.
    #
    # @return [Boolean]
    def imply_photo?
      properties.none? { |prop| prop.name == 'photo' } && properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' } && !nested_items?
    end

    # A URL is implied only when there is no `url` property, no explicit
    # (non-implied) `u-*` property, and no nested item.
    #
    # @return [Boolean]
    def imply_url?
      properties.none? { |prop| prop.name == 'url' } && properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' } && !nested_items?
    end

    # Whether any property is itself an item node or the item has child items.
    # The answer is cached whether it is true or false, and is always a real
    # Boolean rather than the matching property object.
    #
    # @return [Boolean]
    def nested_items?
      return @nested_items if defined?(@nested_items)

      @nested_items = properties.any?(&:item_node?) || children.any?
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Base class for property parsers. Its value is the text content of the
    # property's node after ignored elements are removed and images are
    # replaced by a textual stand-in.
    class BasePropertyParser
      # @param property [MicroMicro::Property, MicroMicro::ImpliedProperty]
      def initialize(property)
        @property = property
        @node = property.node
      end

      # @return [String] the serialized node's text with surrounding whitespace removed
      def value
        @value ||= serialized_node.text.strip
      end

      private

      attr_reader :node, :property

      # Prepares the node for text extraction: elements named in
      # Document.ignored_node_names are unlinked and every `img` gets its
      # `alt` text (or, failing that, its resolved `src`) as content, padded
      # with one space on each side. An `img` with neither attribute is left
      # untouched.
      #
      # NOTE(review): this modifies `node` in place rather than a copy, so the
      # changes are visible to any other code holding the same node.
      #
      # @see microformats2 Parsing Specification sections 1.3.1 and 1.3.4
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_p-_property
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_an_e-_property
      def serialized_node
        @serialized_node ||= begin
          node.css(*Document.ignored_node_names).unlink

          node.css('img').each do |img|
            replacement = img['alt'] || resolved_source_for(img)

            img.content = " #{replacement} " if replacement
          end

          node
        end
      end

      # Resolves the image's `src` against the document URL. Returns nil when
      # there is no `src`, so a nil `relative` is never handed to
      # Absolutely.to_abs.
      #
      # @param img [Nokogiri::XML::Element]
      # @return [String, nil]
      def resolved_source_for(img)
        source = img['src']

        Absolutely.to_abs(base: node.document.url, relative: source) if source
      end
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Splits a date and/or time string into named components and exposes
    # normalized forms of those components.
    class DateTimeParser
      # @see Value Class Pattern section 4.2
      # @see http://microformats.org/wiki/value-class-pattern#Date_and_time_parsing
      #
      # Regexp pattern matching YYYY-MM-DD and YYYY-DDD (ordinal days 000-366)
      DATE_REGEXP_PATTERN = '(?<year>\d{4})-((?<ordinal>36[0-6]|3[0-5]\d|[0-2]\d{2})|(?<month>0\d|1[0-2])-(?<day>3[0-1]|[0-2]\d))'.freeze
      # Regexp pattern matching HH, HH:MM and HH:MM:SS with an optional am/pm
      # marker in either case, with or without periods
      TIME_REGEXP_PATTERN = '(?<hours>2[0-3]|[0-1]?\d)(?::(?<minutes>[0-5]\d))?(?::(?<seconds>[0-5]\d))?(?:\s*?(?<abbreviation>[apAP]\.?[mM]\.?))?'.freeze
      # Regexp pattern matching +/-(XX:YY|XXYY|XX) or the literal string Z
      TIMEZONE_REGEXP_PATTERN = '(?<zulu>Z)|(?<offset>(?:\+|-)(?:1[0-2]|0?\d)(?::?[0-5]\d)?)'.freeze

      # The three patterns combined and anchored to the whole string (not to a
      # single line of it). Every part is optional.
      DATE_TIME_REGEXP = /\A(?:#{DATE_REGEXP_PATTERN})?(?:\s?#{TIME_REGEXP_PATTERN}(?:#{TIMEZONE_REGEXP_PATTERN})?)?\z/.freeze

      CAPTURE_NAMES = %i[year ordinal month day hours minutes seconds abbreviation zulu offset].freeze

      # @param string [String]
      def initialize(string)
        @string = string
      end

      # For every capture name, defines a reader returning the captured text
      # (or nil) and a predicate telling whether anything was captured.
      CAPTURE_NAMES.each do |name|
        define_method(name) { values[name] }
        define_method("#{name}?") { !public_send(name).to_s.strip.empty? }
      end

      # @return [String, nil] "YYYY-MM-DD" when year, month and day were all captured
      def normalized_calendar_date
        @normalized_calendar_date ||= "#{year}-#{month}-#{day}" if year? && month? && day?
      end

      # @return [String, nil] the calendar date if available, otherwise the ordinal date
      def normalized_date
        @normalized_date ||= normalized_calendar_date || normalized_ordinal_date
      end

      # The hour on a 24-hour clock, zero-padded to two digits.
      #
      # @return [String, nil] nil when no hour was captured
      def normalized_hours
        return unless hours?

        @normalized_hours ||= format('%<hours>02d', hours: twenty_four_hour)
      end

      # @return [String] the captured minutes, or "00" when none were captured
      def normalized_minutes
        @normalized_minutes ||= minutes || '00'
      end

      # @return [String, nil] "YYYY-DDD" when year and ordinal day were both captured
      def normalized_ordinal_date
        @normalized_ordinal_date ||= "#{year}-#{ordinal}" if year? && ordinal?
      end

      # @return [String, nil] "HH:MM" or "HH:MM:SS"; nil when no hour was captured
      def normalized_time
        @normalized_time ||= [normalized_hours, normalized_minutes, seconds].compact.join(':') if normalized_hours
      end

      # @return [String, nil] "Z", or the offset with any colon removed
      def normalized_timezone
        @normalized_timezone ||= zulu || offset&.tr(':', '')
      end

      # @return [String] normalized date, time and timezone joined together;
      #   empty when the string did not match
      def value
        @value ||= "#{normalized_date} #{normalized_time}#{normalized_timezone}".strip
      end

      # @return [Boolean]
      def value?
        !value.empty?
      end

      # @return [Hash{Symbol => String, nil}]
      def values
        @values ||= self.class.values_from(string)
      end

      # Matches the string against DATE_TIME_REGEXP and returns the named
      # captures keyed by Symbol. Returns an empty Hash when the string is nil
      # or does not match.
      #
      # @param string [String]
      # @return [Hash{Symbol => String, nil}]
      def self.values_from(string)
        captures = string&.match(DATE_TIME_REGEXP)&.named_captures.to_h

        captures.each_with_object({}) { |(name, text), result| result[name.to_sym] = text }
      end

      private

      attr_reader :string

      # Converts the captured hour to a 24-hour clock Integer: "pm" adds twelve
      # to hours below 12, and "am" turns 12 into 0. The hour is read with
      # `to_i` because `format` would treat a String such as "09" as an
      # (invalid) octal literal.
      #
      # @return [Integer]
      def twenty_four_hour
        hour = hours.to_i

        case abbreviation&.tr('.', '')&.downcase
        when 'pm' then hour < 12 ? hour + 12 : hour
        when 'am' then hour == 12 ? 0 : hour
        else hour
        end
      end
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Parser for `dt-*` properties. Tries, in order, the value class pattern,
    # a date/time-bearing HTML attribute, and finally the node's text.
    class DateTimePropertyParser < BasePropertyParser
      # Maps an attribute name to the element names on which it is consulted.
      #
      # @see microformats2 Parsing Specification section 1.3.3
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_dt-_property
      HTML_ATTRIBUTES_MAP = {
        'datetime' => %w[del ins time],
        'title' => %w[abbr],
        'value' => %w[data input]
      }.freeze

      # The result is computed as a single expression so that it is actually
      # stored in @value whichever branch produces it.
      #
      # @return [String]
      def value
        @value ||=
          if date_time_parser.value?
            resolved_value
          elsif attribute_values.any?
            attribute_values.first
          else
            super
          end
      end

      private

      # Looks through the other `dt-*` properties of the same collection for
      # one whose value contains a date. Properties preceding this one are
      # tried first, nearest first, followed by the ones after it.
      #
      # @return [MicroMicro::Parsers::DateTimeParser, nil]
      def adopted_date_time
        @adopted_date_time ||= begin
          # `split` partitions the dt-* properties around this property.
          partitions = property.collection.select { |prop| prop.prefix == 'dt' }.split(property)
          preceding = partitions.shift

          (preceding.reverse + partitions).flatten.map { |prop| DateTimeParser.new(prop.value) }.find(&:normalized_date)
        end
      end

      # Values of the attributes from HTML_ATTRIBUTES_MAP that apply to this
      # node's element name and are present on the node.
      #
      # @return [Array<String>]
      def attribute_values
        @attribute_values ||=
          HTML_ATTRIBUTES_MAP.map do |attribute, names|
            node[attribute] if names.include?(node.name) && node[attribute]
          end.compact
      end

      # @return [MicroMicro::Parsers::DateTimeParser]
      def date_time_parser
        @date_time_parser ||= DateTimeParser.new(value_class_pattern_parser.value)
      end

      # True when the parsed value has a time but no date of its own.
      #
      # @return [Boolean]
      def imply_date?
        !date_time_parser.normalized_time.nil? && date_time_parser.normalized_date.nil?
      end

      # The parsed value, prefixed with a date adopted from a sibling `dt-*`
      # property when this value is time-only and such a date exists.
      #
      # @return [String]
      def resolved_value
        return "#{adopted_date_time.normalized_date} #{date_time_parser.value}" if imply_date? && adopted_date_time

        date_time_parser.value
      end

      # @return [MicroMicro::Parsers::ValueClassPatternParser]
      def value_class_pattern_parser
        ValueClassPatternParser.new(node, ' ')
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Parser for properties whose value carries both the node's inner HTML and
    # its plain-text value.
    class EmbeddedMarkupPropertyParser < BasePropertyParser
      # Attributes whose values are rewritten as absolute URLs before the
      # markup is serialized.
      HTML_ATTRIBUTE_NAMES = %w[action cite code codebase data href poster src].freeze

      # The `html` entry is built before `super` runs, i.e. before the parent
      # class prepares the node for text extraction.
      #
      # @return [Hash] with the keys :html and :value
      def value
        @value ||= { html: resolved_node.inner_html.strip, value: super }
      end

      private

      # The node itself, after every descendant attribute named in
      # HTML_ATTRIBUTE_NAMES has been resolved against the document URL.
      def resolved_node
        @resolved_node ||= node.tap do |context|
          HTML_ATTRIBUTE_NAMES.each do |attribute_name|
            context.css("[#{attribute_name}]").each do |element|
              relative_url = element[attribute_name].strip

              element[attribute_name] = Absolutely.to_abs(base: context.document.url, relative: relative_url)
            end
          end
        end
      end
    end
  end
end
|