micromicro 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.editorconfig +14 -0
- data/.gitignore +34 -0
- data/.gitmodules +3 -0
- data/.reek.yml +8 -0
- data/.rspec +2 -0
- data/.rubocop +3 -0
- data/.rubocop.yml +25 -0
- data/.ruby-version +1 -0
- data/.simplecov +11 -0
- data/.travis.yml +19 -0
- data/CHANGELOG.md +5 -0
- data/CONTRIBUTING.md +37 -0
- data/Gemfile +14 -0
- data/LICENSE +21 -0
- data/README.md +122 -0
- data/Rakefile +18 -0
- data/lib/micro_micro/collections/base_collection.rb +37 -0
- data/lib/micro_micro/collections/items_collection.rb +10 -0
- data/lib/micro_micro/collections/properties_collection.rb +18 -0
- data/lib/micro_micro/collections/relations_collection.rb +23 -0
- data/lib/micro_micro/document.rb +71 -0
- data/lib/micro_micro/implied_property.rb +25 -0
- data/lib/micro_micro/item.rb +151 -0
- data/lib/micro_micro/parsers/base_property_parser.rb +33 -0
- data/lib/micro_micro/parsers/date_time_parser.rb +85 -0
- data/lib/micro_micro/parsers/date_time_property_parser.rb +65 -0
- data/lib/micro_micro/parsers/embedded_markup_property_parser.rb +28 -0
- data/lib/micro_micro/parsers/implied_name_property_parser.rb +78 -0
- data/lib/micro_micro/parsers/implied_photo_property_parser.rb +69 -0
- data/lib/micro_micro/parsers/implied_url_property_parser.rb +61 -0
- data/lib/micro_micro/parsers/plain_text_property_parser.rb +39 -0
- data/lib/micro_micro/parsers/url_property_parser.rb +75 -0
- data/lib/micro_micro/parsers/value_class_pattern_parser.rb +92 -0
- data/lib/micro_micro/property.rb +116 -0
- data/lib/micro_micro/relation.rb +78 -0
- data/lib/micro_micro/version.rb +3 -0
- data/lib/micromicro.rb +39 -0
- data/micromicro.gemspec +28 -0
- metadata +128 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
module MicroMicro
  # Entry point for parsing a string of HTML: exposes the microformats2 items
  # and the relations found in the markup, and serializes both to a Hash.
  class Document
    # @param markup [String] the HTML to parse
    # @param base_url [String] the URL associated with the provided markup
    def initialize(markup, base_url)
      @markup = markup
      @base_url = base_url
    end

    # @return [String]
    def inspect
      # The inspected collections are passed as format arguments instead of
      # being interpolated into the template: a literal "%" inside them (for
      # example a percent-encoded URL) would otherwise be read as a directive.
      format('#<%s:%#0x items: %s, relations: %s>', self.class.name, object_id, items.inspect, relations.inspect)
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def items
      @items ||= Collections::ItemsCollection.new(Item.items_from(document))
    end

    # @return [MicroMicro::Collections::RelationsCollection]
    def relations
      @relations ||= Collections::RelationsCollection.new(Relation.relations_from(document))
    end

    # @see microformats2 Parsing Specification section 1.1
    # @see http://microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
    #
    # @return [Hash] a Hash with the keys :items, :rels, and :'rel-urls'
    def to_h
      {
        items: items.to_a,
        rels: relations.group_by_rel,
        'rel-urls': relations.group_by_url
      }
    end

    # Whether the given node is one of the element types listed by
    # {ignored_node_names}.
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Boolean]
    def self.ignore_node?(node)
      ignored_node_names.include?(node.name)
    end

    # @return [Array<String>] element names that are skipped while parsing
    def self.ignored_node_names
      %w[script style template]
    end

    private

    attr_reader :base_url, :markup

    # The first +<base>+ element carrying an +href+ attribute, if any.
    #
    # @return [Nokogiri::XML::Element, nil]
    def base_element
      # defined? (rather than ||=) so that a nil result is cached too and the
      # markup is not parsed again on every call.
      return @base_element if defined?(@base_element)

      @base_element = Nokogiri::HTML(markup).at_css('base[href]')
    end

    # @return [Nokogiri::HTML::Document]
    def document
      @document ||= Nokogiri::HTML(markup, resolved_base_url)
    end

    # The base URL for the document: +base_url+ itself, or the +<base href>+
    # value made absolute against +base_url+ when such an element exists.
    #
    # @return [String]
    def resolved_base_url
      @resolved_base_url ||=
        if base_element
          Absolutely.to_abs(base: base_url, relative: base_element['href'])
        else
          base_url
        end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module MicroMicro
  # A Property whose value is computed by one of the implied-property parsers
  # (name, photo, or url) instead of the regular property parsers.
  class ImpliedProperty < Property
    # Implied property name => parser class used to compute its value.
    IMPLIED_PROPERTY_PARSERS_MAP = {
      'name' => Parsers::ImpliedNamePropertyParser,
      'photo' => Parsers::ImpliedPhotoPropertyParser,
      'url' => Parsers::ImpliedUrlPropertyParser
    }.freeze

    # Always true for this class.
    #
    # @return [Boolean]
    def implied?
      true
    end

    # Always false for this class.
    #
    # @return [Boolean]
    def item_node?
      false
    end

    private

    # The parser instance matching this property's name, built once.
    def parser
      return @parser if @parser

      parser_class = IMPLIED_PROPERTY_PARSERS_MAP[name]
      @parser = parser_class.new(self)
    end
  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
module MicroMicro
  # A microformats2 item rooted at a single element: its types, its properties
  # (explicit and implied), and its child items.
  class Item
    attr_accessor :value

    # @param node [Nokogiri::XML::Element]
    def initialize(node)
      @node = node

      # Implied properties are appended only when their conditions hold.
      properties << implied_name if implied_name?
      properties << implied_photo if implied_photo?
      properties << implied_url if implied_url?
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def children
      @children ||= Collections::ItemsCollection.new(Item.items_from(node.element_children))
    end

    # The node's +id+ attribute with surrounding whitespace removed.
    #
    # @return [String, nil]
    def id
      @id ||= node['id']&.strip
    end

    # @return [String]
    def inspect
      # Values are passed as format arguments so nothing inspected can be
      # misinterpreted as a format directive.
      format('#<%s:%#0x types: %s, properties: %s, children: %s>',
             self.class.name, object_id, types.inspect, properties.count, children.count)
    end

    # @return [MicroMicro::Collections::PropertiesCollection]
    def properties
      @properties ||= Collections::PropertiesCollection.new(Property.properties_from(node.element_children))
    end

    # @see microformats2 Parsing Specification section 1.2
    # @see http://microformats.org/wiki/microformats2-parsing#parse_an_element_for_class_microformats
    #
    # @return [Hash] always has :type and :properties; :id, :children, and
    #   :value are included only when present
    def to_h
      hash = {
        type: types,
        properties: properties.to_h
      }

      hash[:id] = id if id.present?
      hash[:children] = children.to_a if children.any?
      hash[:value] = value if value.present?

      hash
    end

    # @return [Array<String>]
    def types
      @types ||= self.class.types_from(node)
    end

    # Whether the node carries at least one microformats2 root class name.
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Boolean]
    def self.item_node?(node)
      types_from(node).any?
    end

    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @return [Array<MicroMicro::Item>]
    def self.items_from(context)
      nodes_from(context).map { |node| new(node) }
    end

    # Recursively collects item nodes below +context+ into +node_set+. Ignored
    # nodes are skipped, the search does not descend into an item node, and
    # item nodes that are also property nodes are left out.
    #
    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @param node_set [Nokogiri::XML::NodeSet]
    # @return [Nokogiri::XML::NodeSet]
    def self.nodes_from(context, node_set = Nokogiri::XML::NodeSet.new(context.document, []))
      return nodes_from(context.element_children, node_set) if context.is_a?(Nokogiri::HTML::Document)

      context.each { |node| nodes_from(node, node_set) } if context.is_a?(Nokogiri::XML::NodeSet)

      if context.is_a?(Nokogiri::XML::Element) && !Document.ignore_node?(context)
        if item_node?(context)
          node_set << context unless Property.property_node?(context)
        else
          nodes_from(context.element_children, node_set)
        end
      end

      node_set
    end

    # The unique, sorted class names on +node+ that look like microformats2
    # root class names ("h-" prefix, optional vendor segment).
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Array<String>]
    #
    # @example
    #   node = Nokogiri::HTML('<div class="h-card">Jason Garber</div>').at_css('div')
    #   MicroMicro::Item.types_from(node) #=> ['h-card']
    def self.types_from(node)
      # \A and \z anchor the whole token; ^ and $ would only anchor a line.
      node.classes.select { |token| token.match?(/\Ah(?:-[0-9a-z]+)?(?:-[a-z]+)+\z/) }.uniq.sort
    end

    private

    attr_reader :node

    # @return [MicroMicro::ImpliedProperty]
    def implied_name
      @implied_name ||= ImpliedProperty.new(node, name: 'name', prefix: 'p')
    end

    # @return [Boolean]
    def implied_name?
      imply_name? && implied_name.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_photo
      @implied_photo ||= ImpliedProperty.new(node, name: 'photo', prefix: 'u')
    end

    # @return [Boolean]
    def implied_photo?
      imply_photo? && implied_photo.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_url
      @implied_url ||= ImpliedProperty.new(node, name: 'url', prefix: 'u')
    end

    # @return [Boolean]
    def implied_url?
      imply_url? && implied_url.value?
    end

    # True when there is no "name" property, no e- or p- prefixed property,
    # and no nested item.
    #
    # @return [Boolean]
    def imply_name?
      properties.none? { |prop| prop.name == 'name' } &&
        properties.none? { |prop| %w[e p].include?(prop.prefix) } &&
        !nested_items?
    end

    # True when there is no "photo" property, no explicit (non-implied) u-
    # prefixed property, and no nested item.
    #
    # @return [Boolean]
    def imply_photo?
      properties.none? { |prop| prop.name == 'photo' } &&
        properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' } &&
        !nested_items?
    end

    # True when there is no "url" property, no explicit (non-implied) u-
    # prefixed property, and no nested item.
    #
    # @return [Boolean]
    def imply_url?
      properties.none? { |prop| prop.name == 'url' } &&
        properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' } &&
        !nested_items?
    end

    # Whether any property is itself an item node or the item has child items.
    #
    # @return [Boolean]
    def nested_items?
      # defined? lets a false result be cached as well; any? yields a strict
      # boolean where find would have returned the matching property.
      return @nested_items if defined?(@nested_items)

      @nested_items = properties.any?(&:item_node?) || children.any?
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Shared behaviour for property parsers: keeps the property and its node,
    # and computes a plain-text value from the node.
    class BasePropertyParser
      # @param property [MicroMicro::Property, MicroMicro::ImpliedProperty]
      def initialize(property)
        @property = property
        @node = property.node
      end

      # The stripped text of the serialized node.
      #
      # @return [String]
      def value
        return @value if @value

        @value = serialized_node.text.strip
      end

      private

      attr_reader :node, :property

      # Prepares the node for text extraction. NOTE: this edits +node+ itself:
      # ignored elements are unlinked and every +img+ has its content replaced
      # by its +alt+ text (or, without +alt+, its absolute +src+), padded with
      # a space on each side.
      #
      # @see microformats2 Parsing Specification sections 1.3.1 and 1.3.4
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_p-_property
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_an_e-_property
      def serialized_node
        return @serialized_node if @serialized_node

        node.css(*Document.ignored_node_names).unlink

        node.css('img').each do |image|
          image.content = " #{image['alt'] || Absolutely.to_abs(base: node.document.url, relative: image['src'])} "
        end

        @serialized_node = node
      end
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Splits a date and/or time string into its components and rebuilds a
    # normalized "date time+zone" value from them.
    class DateTimeParser
      # @see Value Class Pattern section 4.2
      # @see http://microformats.org/wiki/value-class-pattern#Date_and_time_parsing
      #
      # Regexp pattern matching YYYY-MM-DD and YYYY-DDD (ordinal days 000-366)
      DATE_REGEXP_PATTERN = '(?<year>\d{4})-((?<ordinal>36[0-6]|3[0-5]\d|[0-2]\d{2})|(?<month>0\d|1[0-2])-(?<day>3[0-1]|[0-2]\d))'.freeze
      # Regexp pattern matching HH, HH:MM and HH:MM:SS with an optional
      # am/pm marker in either case, with or without periods
      TIME_REGEXP_PATTERN = '(?<hours>2[0-3]|[0-1]?\d)(?::(?<minutes>[0-5]\d))?(?::(?<seconds>[0-5]\d))?(?:\s*?(?<abbreviation>[apAP]\.?[mM]\.?))?'.freeze
      # Regexp pattern matching +/-(XX:YY|XXYY|XX) or the literal string Z
      TIMEZONE_REGEXP_PATTERN = '(?<zulu>Z)|(?<offset>(?:\+|-)(?:1[0-2]|0?\d)(?::?[0-5]\d)?)'.freeze

      CAPTURE_NAMES = [:year, :ordinal, :month, :day, :hours, :minutes, :seconds, :abbreviation, :zulu, :offset].freeze

      # @param string [String]
      def initialize(string)
        @string = string
      end

      # For every capture name, define a reader (e.g. #year) returning the
      # captured String or nil, and a predicate (e.g. #year?).
      CAPTURE_NAMES.each do |name|
        define_method(name) { values[name] }
        define_method("#{name}?") { public_send(name).present? }
      end

      # @return [String, nil] "YYYY-MM-DD" when year, month, and day were captured
      def normalized_calendar_date
        @normalized_calendar_date ||= "#{year}-#{month}-#{day}" if year? && month? && day?
      end

      # @return [String, nil] the calendar date, else the ordinal date
      def normalized_date
        @normalized_date ||= normalized_calendar_date || normalized_ordinal_date
      end

      # @return [String, nil] the two-digit, 24-hour form of the captured hours
      def normalized_hours
        @normalized_hours ||= format('%02d', twenty_four_hour) if hours?
      end

      # @return [String] the captured minutes, or "00" when none were given
      def normalized_minutes
        @normalized_minutes ||= minutes || '00'
      end

      # @return [String, nil] "YYYY-DDD" when year and ordinal day were captured
      def normalized_ordinal_date
        @normalized_ordinal_date ||= "#{year}-#{ordinal}" if year? && ordinal?
      end

      # @return [String, nil] "HH:MM" or "HH:MM:SS" when hours were captured
      def normalized_time
        @normalized_time ||= [normalized_hours, normalized_minutes, seconds].compact.join(':') if normalized_hours
      end

      # @return [String, nil] "Z", or the captured offset with any colon removed
      def normalized_timezone
        @normalized_timezone ||= zulu || offset&.tr(':', '')
      end

      # The normalized date, time, and timezone joined as "date time+zone";
      # an empty String when nothing was recognized.
      #
      # @return [String]
      def value
        @value ||= "#{normalized_date} #{normalized_time}#{normalized_timezone}".strip
      end

      # @return [Boolean]
      def value?
        value.present?
      end

      # @return [Hash{Symbol => String, nil}]
      def values
        @values ||= self.class.values_from(string)
      end

      # Matches the whole string against the combined date/time/timezone
      # pattern. Returns an empty Hash for nil or non-matching input.
      #
      # @param string [String]
      # @return [Hash{Symbol => String, nil}]
      def self.values_from(string)
        # \A and \Z anchor the whole string (a trailing newline is tolerated);
        # ^ and $ would accept any single matching line of a multi-line string.
        string&.match(/\A(?:#{DATE_REGEXP_PATTERN})?(?:\s?#{TIME_REGEXP_PATTERN}(?:#{TIMEZONE_REGEXP_PATTERN})?)?\Z/)&.named_captures.to_h.symbolize_keys
      end

      private

      attr_reader :string

      # Captured hours as an Integer on the 24-hour clock. With a "pm" marker
      # 1-11 become 13-23 and 12 stays 12; with an "am" marker 12 becomes 0.
      # String#to_i is used so zero-padded hours like "08" are read as decimal.
      #
      # @return [Integer]
      def twenty_four_hour
        hour = hours.to_i

        case abbreviation&.tr('.', '')&.downcase
        when 'pm' then hour < 12 ? hour + 12 : hour
        when 'am' then hour == 12 ? 0 : hour
        else hour
        end
      end
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Computes the value of a dt- property.
    class DateTimePropertyParser < BasePropertyParser
      # Attribute name => element names on which that attribute holds the value.
      #
      # @see microformats2 Parsing Specification section 1.3.3
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_dt-_property
      HTML_ATTRIBUTES_MAP = {
        'datetime' => %w[del ins time],
        'title' => %w[abbr],
        'value' => %w[data input]
      }.freeze

      # In order of preference: the value-class-pattern date/time (with an
      # adopted date if needed), the first matching attribute value, or the
      # plain-text value from the superclass.
      #
      # @return [String]
      def value
        # A conditional expression (not early returns) so that every branch's
        # result is actually stored in @value.
        @value ||=
          if date_time_parser.value?
            resolved_value
          elsif attribute_values.any?
            attribute_values.first
          else
            super
          end
      end

      private

      # The nearest dt- sibling property whose value contains a date: those
      # before this property are searched closest-first, then those after it.
      #
      # @return [MicroMicro::Parsers::DateTimeParser, nil]
      def adopted_date_time
        @adopted_date_time ||= begin
          collections = property.collection.select { |prop| prop.prefix == 'dt' }.split(property)

          (collections.shift.reverse + collections).flatten.map { |prop| DateTimeParser.new(prop.value) }.find(&:normalized_date)
        end
      end

      # @return [Array<String>]
      def attribute_values
        @attribute_values ||= begin
          HTML_ATTRIBUTES_MAP.map do |attribute, names|
            node[attribute] if names.include?(node.name) && node[attribute]
          end.compact
        end
      end

      # @return [MicroMicro::Parsers::DateTimeParser]
      def date_time_parser
        @date_time_parser ||= DateTimeParser.new(value_class_pattern_parser.value)
      end

      # Whether the parsed value has a time but no date of its own.
      #
      # @return [Boolean]
      def imply_date?
        # Coerced so the predicate returns true/false instead of a String/nil.
        !!date_time_parser.normalized_time && !date_time_parser.normalized_date
      end

      # The parsed value, prefixed with the adopted date when this property
      # only supplies a time and a sibling supplies a date.
      #
      # @return [String]
      def resolved_value
        return "#{adopted_date_time.normalized_date} #{date_time_parser.value}" if imply_date? && adopted_date_time

        date_time_parser.value
      end

      # @return [MicroMicro::Parsers::ValueClassPatternParser]
      def value_class_pattern_parser
        @value_class_pattern_parser ||= ValueClassPatternParser.new(node, ' ')
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module MicroMicro
  module Parsers
    # Computes the value of a property as both markup and plain text.
    class EmbeddedMarkupPropertyParser < BasePropertyParser
      # Attributes whose values are rewritten to absolute URLs.
      HTML_ATTRIBUTE_NAMES = %w[action cite code codebase data href poster src].freeze

      # @return [Hash] :html holds the node's stripped inner HTML and :value
      #   holds the superclass's plain-text value
      def value
        @value ||= begin
          # The markup is captured before calling super, matching the
          # evaluation order of the two hash entries.
          markup = resolved_node.inner_html.strip

          { html: markup, value: super }
        end
      end

      private

      # The node after every descendant attribute listed in
      # HTML_ATTRIBUTE_NAMES has been made absolute against the document URL.
      def resolved_node
        return @resolved_node if @resolved_node

        HTML_ATTRIBUTE_NAMES.each do |attribute_name|
          node.css("[#{attribute_name}]").each do |element|
            element[attribute_name] = Absolutely.to_abs(base: node.document.url, relative: element[attribute_name].strip)
          end
        end

        @resolved_node = node
      end
    end
  end
end
|