micromicro 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.editorconfig +14 -0
  3. data/.gitignore +34 -0
  4. data/.gitmodules +3 -0
  5. data/.reek.yml +8 -0
  6. data/.rspec +2 -0
  7. data/.rubocop +3 -0
  8. data/.rubocop.yml +25 -0
  9. data/.ruby-version +1 -0
  10. data/.simplecov +11 -0
  11. data/.travis.yml +19 -0
  12. data/CHANGELOG.md +5 -0
  13. data/CONTRIBUTING.md +37 -0
  14. data/Gemfile +14 -0
  15. data/LICENSE +21 -0
  16. data/README.md +122 -0
  17. data/Rakefile +18 -0
  18. data/lib/micro_micro/collections/base_collection.rb +37 -0
  19. data/lib/micro_micro/collections/items_collection.rb +10 -0
  20. data/lib/micro_micro/collections/properties_collection.rb +18 -0
  21. data/lib/micro_micro/collections/relations_collection.rb +23 -0
  22. data/lib/micro_micro/document.rb +71 -0
  23. data/lib/micro_micro/implied_property.rb +25 -0
  24. data/lib/micro_micro/item.rb +151 -0
  25. data/lib/micro_micro/parsers/base_property_parser.rb +33 -0
  26. data/lib/micro_micro/parsers/date_time_parser.rb +85 -0
  27. data/lib/micro_micro/parsers/date_time_property_parser.rb +65 -0
  28. data/lib/micro_micro/parsers/embedded_markup_property_parser.rb +28 -0
  29. data/lib/micro_micro/parsers/implied_name_property_parser.rb +78 -0
  30. data/lib/micro_micro/parsers/implied_photo_property_parser.rb +69 -0
  31. data/lib/micro_micro/parsers/implied_url_property_parser.rb +61 -0
  32. data/lib/micro_micro/parsers/plain_text_property_parser.rb +39 -0
  33. data/lib/micro_micro/parsers/url_property_parser.rb +75 -0
  34. data/lib/micro_micro/parsers/value_class_pattern_parser.rb +92 -0
  35. data/lib/micro_micro/property.rb +116 -0
  36. data/lib/micro_micro/relation.rb +78 -0
  37. data/lib/micro_micro/version.rb +3 -0
  38. data/lib/micromicro.rb +39 -0
  39. data/micromicro.gemspec +28 -0
  40. metadata +128 -0
@@ -0,0 +1,71 @@
1
module MicroMicro
  # Wraps a string of HTML plus the URL it was retrieved from and exposes the
  # items and relations extracted from the parsed markup.
  class Document
    # @param markup [String] the HTML to parse
    # @param base_url [String] the URL associated with the provided markup
    def initialize(markup, base_url)
      @markup = markup
      @base_url = base_url
    end

    # Only the object id goes through +format+. The collections' +inspect+
    # output is interpolated afterwards so that a literal "%" in it (for
    # example a percent-encoded URL) is never read as a format directive.
    #
    # @return [String]
    def inspect
      "#<#{self.class.name}:#{format('%#0x', object_id)} " \
        "items: #{items.inspect}, " \
        "relations: #{relations.inspect}>"
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def items
      @items ||= Collections::ItemsCollection.new(Item.items_from(document))
    end

    # @return [MicroMicro::Collections::RelationsCollection]
    def relations
      @relations ||= Collections::RelationsCollection.new(Relation.relations_from(document))
    end

    # @see microformats2 Parsing Specification section 1.1
    # @see http://microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
    #
    # @return [Hash] a Hash with the keys +:items+, +:rels+ and +:'rel-urls'+
    def to_h
      {
        items: items.to_a,
        rels: relations.group_by_rel,
        'rel-urls': relations.group_by_url
      }
    end

    # Whether the node's name is one of {ignored_node_names}.
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Boolean]
    def self.ignore_node?(node)
      ignored_node_names.include?(node.name)
    end

    # @return [Array<String>]
    def self.ignored_node_names
      %w[script style template]
    end

    private

    attr_reader :base_url, :markup

    # The first +<base href>+ element of the markup, if any. Memoized with
    # +defined?+ so that a nil result (no such element) is cached too and the
    # markup is not parsed again on later calls.
    #
    # @return [Nokogiri::XML::Element, nil]
    def base_element
      return @base_element if defined?(@base_element)

      @base_element = Nokogiri::HTML(markup).at_css('base[href]')
    end

    # @return [Nokogiri::HTML::Document]
    def document
      @document ||= Nokogiri::HTML(markup, resolved_base_url)
    end

    # The provided base URL, or the +<base>+ element's +href+ resolved against
    # it when the markup contains such an element.
    #
    # @return [String]
    def resolved_base_url
      @resolved_base_url ||=
        if base_element
          Absolutely.to_abs(base: base_url, relative: base_element['href'])
        else
          base_url
        end
    end
  end
end
@@ -0,0 +1,25 @@
1
module MicroMicro
  # A Property subclass that always reports itself as implied, never as an
  # item node, and whose parser is selected by property name.
  class ImpliedProperty < Property
    # Parser class to instantiate for each supported implied property name.
    IMPLIED_PROPERTY_PARSERS_MAP = {
      'name' => Parsers::ImpliedNamePropertyParser,
      'photo' => Parsers::ImpliedPhotoPropertyParser,
      'url' => Parsers::ImpliedUrlPropertyParser
    }.freeze

    # @return [Boolean] always +true+
    def implied?
      true
    end

    # @return [Boolean] always +false+
    def item_node?
      false
    end

    private

    # Memoized parser instance looked up by this property's name.
    def parser
      @parser ||= begin
        parser_class = IMPLIED_PROPERTY_PARSERS_MAP[name]
        parser_class.new(self)
      end
    end
  end
end
@@ -0,0 +1,151 @@
1
module MicroMicro
  # A single microformats2 item backed by one element node.
  class Item
    attr_accessor :value

    # Builds the item and appends the implied name, photo and url properties
    # to its property collection whenever the corresponding rule applies.
    #
    # @param node [Nokogiri::XML::Element]
    def initialize(node)
      @node = node

      properties << implied_name if implied_name?
      properties << implied_photo if implied_photo?
      properties << implied_url if implied_url?
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def children
      @children ||= Collections::ItemsCollection.new(Item.items_from(node.element_children))
    end

    # The node's +id+ attribute with surrounding whitespace removed.
    #
    # @return [String, nil]
    def id
      @id ||= node['id']&.strip
    end

    # Only the object id goes through +format+; everything else is plain
    # interpolation so no inspected value can be read as a format directive.
    #
    # @return [String]
    def inspect
      "#<#{self.class.name}:#{format('%#0x', object_id)} " \
        "types: #{types.inspect}, " \
        "properties: #{properties.count}, " \
        "children: #{children.count}>"
    end

    # @return [MicroMicro::Collections::PropertiesCollection]
    def properties
      @properties ||= Collections::PropertiesCollection.new(Property.properties_from(node.element_children))
    end

    # @see microformats2 Parsing Specification section 1.2
    # @see http://microformats.org/wiki/microformats2-parsing#parse_an_element_for_class_microformats
    #
    # @return [Hash] always has +:type+ and +:properties+; +:id+, +:children+
    #   and +:value+ are added only when present
    def to_h
      hash = {
        type: types,
        properties: properties.to_h
      }

      hash[:id] = id if id.present?
      hash[:children] = children.to_a if children.any?
      hash[:value] = value if value.present?

      hash
    end

    # @return [Array<String>]
    def types
      @types ||= self.class.types_from(node)
    end

    # Whether the node carries at least one root class name.
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Boolean]
    def self.item_node?(node)
      types_from(node).any?
    end

    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @return [Array<MicroMicro::Item>]
    def self.items_from(context)
      nodes_from(context).map { |node| new(node) }
    end

    # Recursively collects item nodes into +node_set+. A document is entered
    # through its element children and a node set is walked member by member.
    # A non-ignored element that is an item node is collected (unless it is
    # also a property node) and not descended into; any other non-ignored
    # element is searched through its element children.
    #
    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @param node_set [Nokogiri::XML::NodeSet]
    # @return [Nokogiri::XML::NodeSet]
    def self.nodes_from(context, node_set = Nokogiri::XML::NodeSet.new(context.document, []))
      return nodes_from(context.element_children, node_set) if context.is_a?(Nokogiri::HTML::Document)

      context.each { |node| nodes_from(node, node_set) } if context.is_a?(Nokogiri::XML::NodeSet)

      if context.is_a?(Nokogiri::XML::Element) && !Document.ignore_node?(context)
        if item_node?(context)
          node_set << context unless Property.property_node?(context)
        else
          nodes_from(context.element_children, node_set)
        end
      end

      node_set
    end

    # The node's unique, sorted root class names ("h-" followed by an optional
    # alphanumeric vendor segment and one or more lowercase segments).
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Array<String>]
    #
    # @example
    #   node = Nokogiri::HTML('<div class="h-card">Jason Garber</div>').at_css('div')
    #   MicroMicro::Item.types_from(node) #=> ['h-card']
    def self.types_from(node)
      # \A and \z anchor the whole token; ^ and $ would only anchor a line.
      node.classes.select { |token| token.match?(/\Ah(?:-[0-9a-z]+)?(?:-[a-z]+)+\z/) }.uniq.sort
    end

    private

    attr_reader :node

    # @return [MicroMicro::ImpliedProperty]
    def implied_name
      @implied_name ||= ImpliedProperty.new(node, name: 'name', prefix: 'p')
    end

    # @return [Boolean]
    def implied_name?
      imply_name? && implied_name.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_photo
      @implied_photo ||= ImpliedProperty.new(node, name: 'photo', prefix: 'u')
    end

    # @return [Boolean]
    def implied_photo?
      imply_photo? && implied_photo.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_url
      @implied_url ||= ImpliedProperty.new(node, name: 'url', prefix: 'u')
    end

    # @return [Boolean]
    def implied_url?
      imply_url? && implied_url.value?
    end

    # True when there is no "name" property, no "e" or "p" prefixed property
    # and no nested item.
    #
    # @return [Boolean]
    def imply_name?
      properties.none? { |prop| prop.name == 'name' } && properties.none? { |prop| %w[e p].include?(prop.prefix) } && !nested_items?
    end

    # True when there is no "photo" property, no non-implied "u" prefixed
    # property and no nested item.
    #
    # @return [Boolean]
    def imply_photo?
      properties.none? { |prop| prop.name == 'photo' } && properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' } && !nested_items?
    end

    # True when there is no "url" property, no non-implied "u" prefixed
    # property and no nested item.
    #
    # @return [Boolean]
    def imply_url?
      properties.none? { |prop| prop.name == 'url' } && properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' } && !nested_items?
    end

    # Whether any property is itself an item node or the item has child items.
    # Memoized with +defined?+ so a +false+ answer is cached as well, and the
    # result is a strict Boolean rather than the matching property object.
    #
    # @return [Boolean]
    def nested_items?
      return @nested_items if defined?(@nested_items)

      @nested_items = properties.any?(&:item_node?) || children.any?
    end
  end
end
@@ -0,0 +1,33 @@
1
module MicroMicro
  module Parsers
    # Base parser whose value is the stripped text content of the property's
    # node after ignored elements are removed and images are replaced by text.
    class BasePropertyParser
      # @param property [MicroMicro::Property, MicroMicro::ImpliedProperty]
      def initialize(property)
        @property = property
        @node = property.node
      end

      # @return [String]
      def value
        @value ||= serialized_node.text.strip
      end

      private

      attr_reader :node, :property

      # Prepares the node for text extraction. This mutates the node in place:
      # descendants named in +Document.ignored_node_names+ are unlinked and
      # the content of every +img+ is set to its replacement text padded with
      # a space on each side.
      #
      # @see microformats2 Parsing Specification sections 1.3.1 and 1.3.4
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_p-_property
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_an_e-_property
      #
      # @return [Nokogiri::XML::Element]
      def serialized_node
        @serialized_node ||= begin
          node.css(*Document.ignored_node_names).unlink

          node.css('img').each { |img| img.content = " #{image_replacement_text(img)} " }

          node
        end
      end

      # Replacement text for an image: its +alt+ attribute when present,
      # otherwise its +src+ resolved against the document URL.
      #
      # An image with neither attribute yields nil (rendered as an empty
      # string) instead of handing a nil relative URL to the resolver.
      # NOTE(review): the resolver is presumed to reject or mishandle a nil
      # +relative+ argument; confirm against the Absolutely gem.
      #
      # @param img [Nokogiri::XML::Element]
      # @return [String, nil]
      def image_replacement_text(img)
        alt = img['alt']
        return alt if alt

        src = img['src']
        Absolutely.to_abs(base: node.document.url, relative: src) if src
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
module MicroMicro
  module Parsers
    # Splits a date, time and/or timezone string into named parts and
    # re-assembles them in a normalized form.
    class DateTimeParser
      # @see Value Class Pattern section 4.2
      # @see http://microformats.org/wiki/value-class-pattern#Date_and_time_parsing
      #
      # Regexp pattern matching YYYY-MM-DD and YYYY-DDD (ordinal days 000-366)
      DATE_REGEXP_PATTERN = '(?<year>\d{4})-((?<ordinal>36[0-6]|3[0-5]\d|[0-2]\d{2})|(?<month>0\d|1[0-2])-(?<day>3[0-1]|[0-2]\d))'.freeze
      # Regexp pattern matching HH, HH:MM and HH:MM:SS with an optional
      # am/pm marker in either letter case, with or without periods
      TIME_REGEXP_PATTERN = '(?<hours>2[0-3]|[0-1]?\d)(?::(?<minutes>[0-5]\d))?(?::(?<seconds>[0-5]\d))?(?:\s*?(?<abbreviation>[apAP]\.?[mM]\.?))?'.freeze
      # Regexp pattern matching +/-(XX:YY|XXYY|XX) or the literal string Z
      TIMEZONE_REGEXP_PATTERN = '(?<zulu>Z)|(?<offset>(?:\+|-)(?:1[0-2]|0?\d)(?::?[0-5]\d)?)'.freeze

      # The complete expression, compiled once. \A and \z anchor the whole
      # string; ^ and $ would let one line of a multi-line string match.
      DATE_TIME_REGEXP = /\A(?:#{DATE_REGEXP_PATTERN})?(?:\s?#{TIME_REGEXP_PATTERN}(?:#{TIMEZONE_REGEXP_PATTERN})?)?\z/.freeze

      CAPTURE_NAMES = [:year, :ordinal, :month, :day, :hours, :minutes, :seconds, :abbreviation, :zulu, :offset].freeze

      # @param string [String]
      def initialize(string)
        @string = string
      end

      # For every capture name, defines a reader returning the captured text
      # (or nil) and a predicate telling whether something was captured.
      CAPTURE_NAMES.each do |name|
        define_method(name) { values[name] }
        define_method("#{name}?") { !public_send(name).to_s.empty? }
      end

      # @return [String, nil] "YYYY-MM-DD" when year, month and day were captured
      def normalized_calendar_date
        @normalized_calendar_date ||= "#{year}-#{month}-#{day}" if year? && month? && day?
      end

      # @return [String, nil] the calendar date, or else the ordinal date
      def normalized_date
        @normalized_date ||= normalized_calendar_date || normalized_ordinal_date
      end

      # The hour as two digits on a 24-hour clock.
      #
      # @return [String, nil] nil when no hour was captured
      def normalized_hours
        return unless hours?

        @normalized_hours ||= format('%<hours>02d', hours: twenty_four_hour)
      end

      # @return [String] the captured minutes, defaulting to "00"
      def normalized_minutes
        @normalized_minutes ||= minutes || '00'
      end

      # @return [String, nil] "YYYY-DDD" when year and ordinal day were captured
      def normalized_ordinal_date
        @normalized_ordinal_date ||= "#{year}-#{ordinal}" if year? && ordinal?
      end

      # @return [String, nil] "HH:MM" or "HH:MM:SS"; nil when no hour was captured
      def normalized_time
        @normalized_time ||= [normalized_hours, normalized_minutes, seconds].compact.join(':') if normalized_hours
      end

      # @return [String, nil] "Z", or the offset with any colon removed
      def normalized_timezone
        @normalized_timezone ||= zulu || offset&.tr(':', '')
      end

      # @return [String] normalized date, time and timezone; empty when the
      #   string could not be parsed
      def value
        @value ||= "#{normalized_date} #{normalized_time}#{normalized_timezone}".strip
      end

      # @return [Boolean]
      def value?
        !value.empty?
      end

      # @return [Hash{Symbol => String, nil}]
      def values
        @values ||= self.class.values_from(string)
      end

      # @param string [String, nil]
      # @return [Hash{Symbol => String, nil}] the named captures keyed by
      #   Symbol; empty when +string+ is nil or does not match
      def self.values_from(string)
        match = string&.match(DATE_TIME_REGEXP)
        return {} unless match

        match.named_captures.map { |name, capture| [name.to_sym, capture] }.to_h
      end

      private

      attr_reader :string

      # The captured hour as an Integer on a 24-hour clock. +to_i+ reads the
      # digits as decimal, so a zero-padded "08" or "09" is safe to convert.
      # With a "pm" marker hours below 12 are shifted by 12 (12pm stays 12);
      # with an "am" marker hour 12 becomes 0.
      #
      # @return [Integer]
      def twenty_four_hour
        hour = hours.to_i

        case abbreviation&.tr('.', '')&.downcase
        when 'pm' then hour < 12 ? hour + 12 : hour
        when 'am' then hour == 12 ? 0 : hour
        else hour
        end
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
module MicroMicro
  module Parsers
    # Parser for date/time properties. The value is taken from the node's
    # value-class-pattern result when that parses as a date/time, then from a
    # matching HTML attribute, and finally from the base parser's text value.
    class DateTimePropertyParser < BasePropertyParser
      # Attribute name => names of the elements on which it is consulted.
      #
      # @see microformats2 Parsing Specification section 1.3.3
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_dt-_property
      HTML_ATTRIBUTES_MAP = {
        'datetime' => %w[del ins time],
        'title' => %w[abbr],
        'value' => %w[data input]
      }.freeze

      # Written as a single conditional expression (no early +return+) so
      # that every branch's result is assigned to the memo.
      #
      # @return [String]
      def value
        @value ||=
          if date_time_parser.value?
            resolved_value
          elsif attribute_values.any?
            attribute_values.first
          else
            super
          end
      end

      private

      # The nearest sibling "dt" property that has a date: properties before
      # this one are searched first, closest first, then the ones after it.
      #
      # @return [MicroMicro::Parsers::DateTimeParser, nil]
      def adopted_date_time
        @adopted_date_time ||= begin
          collections = property.collection.select { |prop| prop.prefix == 'dt' }.split(property)

          (collections.shift.reverse + collections).flatten.map { |prop| DateTimeParser.new(prop.value) }.find(&:normalized_date)
        end
      end

      # Values of the mapped attributes that apply to this node's element
      # name and are present on the node.
      #
      # @return [Array<String>]
      def attribute_values
        @attribute_values ||= begin
          HTML_ATTRIBUTES_MAP.map do |attribute, names|
            node[attribute] if names.include?(node.name) && node[attribute]
          end.compact
        end
      end

      # @return [MicroMicro::Parsers::DateTimeParser]
      def date_time_parser
        @date_time_parser ||= DateTimeParser.new(value_class_pattern_parser.value)
      end

      # Whether the parsed value has a time but no date of its own.
      #
      # @return [Boolean]
      def imply_date?
        !date_time_parser.normalized_time.nil? && date_time_parser.normalized_date.nil?
      end

      # The parsed value, prefixed with an adopted sibling's date when this
      # property only supplies a time and such a sibling exists.
      #
      # @return [String]
      def resolved_value
        return "#{adopted_date_time.normalized_date} #{date_time_parser.value}" if imply_date? && adopted_date_time

        date_time_parser.value
      end

      # @return [MicroMicro::Parsers::ValueClassPatternParser]
      def value_class_pattern_parser
        ValueClassPatternParser.new(node, ' ')
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module MicroMicro
  module Parsers
    # Parser for embedded markup properties: the value is a Hash holding the
    # node's inner HTML together with the base parser's text value.
    class EmbeddedMarkupPropertyParser < BasePropertyParser
      # Attributes whose values are rewritten as absolute URLs.
      HTML_ATTRIBUTE_NAMES = %w[action cite code codebase data href poster src].freeze

      # The HTML is captured before +super+ runs, keeping the original
      # evaluation order of the two entries.
      #
      # @return [Hash{Symbol => String}] +:html+ and +:value+
      def value
        @value ||= begin
          inner_html = resolved_node.inner_html.strip

          { html: inner_html, value: super }
        end
      end

      private

      # The node after every descendant attribute listed in
      # HTML_ATTRIBUTE_NAMES has been replaced, in place, by its stripped
      # value resolved against the document URL.
      #
      # @return [Nokogiri::XML::Element]
      def resolved_node
        @resolved_node ||= begin
          HTML_ATTRIBUTE_NAMES.each do |attribute_name|
            node.css("[#{attribute_name}]").each do |element|
              relative_url = element[attribute_name].strip

              element[attribute_name] = Absolutely.to_abs(base: node.document.url, relative: relative_url)
            end
          end

          node
        end
      end
    end
  end
end