micromicro 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.editorconfig +14 -0
  3. data/.gitignore +34 -0
  4. data/.gitmodules +3 -0
  5. data/.reek.yml +8 -0
  6. data/.rspec +2 -0
  7. data/.rubocop +3 -0
  8. data/.rubocop.yml +25 -0
  9. data/.ruby-version +1 -0
  10. data/.simplecov +11 -0
  11. data/.travis.yml +19 -0
  12. data/CHANGELOG.md +5 -0
  13. data/CONTRIBUTING.md +37 -0
  14. data/Gemfile +14 -0
  15. data/LICENSE +21 -0
  16. data/README.md +122 -0
  17. data/Rakefile +18 -0
  18. data/lib/micro_micro/collections/base_collection.rb +37 -0
  19. data/lib/micro_micro/collections/items_collection.rb +10 -0
  20. data/lib/micro_micro/collections/properties_collection.rb +18 -0
  21. data/lib/micro_micro/collections/relations_collection.rb +23 -0
  22. data/lib/micro_micro/document.rb +71 -0
  23. data/lib/micro_micro/implied_property.rb +25 -0
  24. data/lib/micro_micro/item.rb +151 -0
  25. data/lib/micro_micro/parsers/base_property_parser.rb +33 -0
  26. data/lib/micro_micro/parsers/date_time_parser.rb +85 -0
  27. data/lib/micro_micro/parsers/date_time_property_parser.rb +65 -0
  28. data/lib/micro_micro/parsers/embedded_markup_property_parser.rb +28 -0
  29. data/lib/micro_micro/parsers/implied_name_property_parser.rb +78 -0
  30. data/lib/micro_micro/parsers/implied_photo_property_parser.rb +69 -0
  31. data/lib/micro_micro/parsers/implied_url_property_parser.rb +61 -0
  32. data/lib/micro_micro/parsers/plain_text_property_parser.rb +39 -0
  33. data/lib/micro_micro/parsers/url_property_parser.rb +75 -0
  34. data/lib/micro_micro/parsers/value_class_pattern_parser.rb +92 -0
  35. data/lib/micro_micro/property.rb +116 -0
  36. data/lib/micro_micro/relation.rb +78 -0
  37. data/lib/micro_micro/version.rb +3 -0
  38. data/lib/micromicro.rb +39 -0
  39. data/micromicro.gemspec +28 -0
  40. metadata +128 -0
@@ -0,0 +1,71 @@
1
module MicroMicro
  # Entry point that wraps an HTML string plus its URL and exposes the
  # microformats2 items and rel-based relations found in that markup.
  class Document
    # @param markup [String] the HTML to parse
    # @param base_url [String] the URL associated with the provided markup
    def initialize(markup, base_url)
      @markup = markup
      @base_url = base_url
    end

    # Only the object id goes through +format+. The collections' own
    # +inspect+ output is interpolated afterwards, so a literal "%" inside it
    # (for example a percent-encoded URL) is never read as a format directive.
    #
    # @return [String]
    def inspect
      "#<#{self.class.name}:#{format('%#0x', object_id)} " \
        "items: #{items.inspect}, relations: #{relations.inspect}>"
    end

    # @return [MicroMicro::Collections::ItemsCollection]
    def items
      @items ||= Collections::ItemsCollection.new(Item.items_from(document))
    end

    # @return [MicroMicro::Collections::RelationsCollection]
    def relations
      @relations ||= Collections::RelationsCollection.new(Relation.relations_from(document))
    end

    # @see microformats2 Parsing Specification section 1.1
    # @see http://microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
    #
    # @return [Hash] a Hash with the keys +:items+, +:rels+, and +:'rel-urls'+
    def to_h
      {
        items: items.to_a,
        rels: relations.group_by_rel,
        'rel-urls': relations.group_by_url
      }
    end

    # @param node [Nokogiri::XML::Element]
    # @return [Boolean] true when the node's name is one of {ignored_node_names}
    def self.ignore_node?(node)
      ignored_node_names.include?(node.name)
    end

    # @return [Array<String>] element names that are skipped while parsing
    def self.ignored_node_names
      %w[script style template]
    end

    private

    attr_reader :base_url, :markup

    # The first <base href> element of the markup, if any. The lookup is
    # memoized with +defined?+ so that a missing element (nil) is remembered
    # too instead of re-parsing the markup on every call.
    #
    # @return [Nokogiri::XML::Element, nil]
    def base_element
      return @base_element if defined?(@base_element)

      @base_element = Nokogiri::HTML(markup).at_css('base[href]')
    end

    # @return [Nokogiri::HTML::Document]
    def document
      @document ||= Nokogiri::HTML(markup, resolved_base_url)
    end

    # The URL relative references are resolved against: the <base href>
    # value resolved against +base_url+ when such an element exists,
    # otherwise +base_url+ itself.
    #
    # @return [String]
    def resolved_base_url
      @resolved_base_url ||=
        if base_element
          Absolutely.to_abs(base: base_url, relative: base_element['href'])
        else
          base_url
        end
    end
  end
end
@@ -0,0 +1,25 @@
1
module MicroMicro
  # Property subclass that always reports itself as implied and never as an
  # item node. Its parser class is looked up by property name.
  class ImpliedProperty < Property
    # Parser class used for each supported implied property name.
    IMPLIED_PROPERTY_PARSERS_MAP = {
      'name' => Parsers::ImpliedNamePropertyParser,
      'photo' => Parsers::ImpliedPhotoPropertyParser,
      'url' => Parsers::ImpliedUrlPropertyParser
    }.freeze

    # @return [Boolean] always true
    def implied?
      true
    end

    # @return [Boolean] always false
    def item_node?
      false
    end

    private

    # Builds (once) the parser registered for this property's name.
    def parser
      return @parser if @parser

      parser_class = IMPLIED_PROPERTY_PARSERS_MAP[name]
      @parser = parser_class.new(self)
    end
  end
end
@@ -0,0 +1,151 @@
1
module MicroMicro
  # Wraps one microformats2 item node and exposes its types, properties,
  # child items, and Hash representation.
  class Item
    attr_accessor :value

    # Stores the node, then appends the implied name, photo, and url
    # properties (checked in that order) whenever the matching check passes.
    #
    # @param node [Nokogiri::XML::Element]
    def initialize(node)
      @node = node

      %w[name photo url].each do |kind|
        properties << send("implied_#{kind}") if send("implied_#{kind}?")
      end
    end

    # Items found among this node's element children.
    #
    # @return [MicroMicro::Collections::ItemsCollection]
    def children
      return @children if @children

      child_items = Item.items_from(node.element_children)
      @children = Collections::ItemsCollection.new(child_items)
    end

    # The node's +id+ attribute with surrounding whitespace removed.
    #
    # @return [String, nil]
    def id
      return @id if @id

      raw_id = node['id']
      @id = raw_id&.strip
    end

    # @return [String]
    def inspect
      template = %(#<#{self.class.name}:%#0x types: #{types.inspect}, properties: #{properties.count}, children: #{children.count}>)

      format(template, object_id)
    end

    # Properties found among this node's element children.
    #
    # @return [MicroMicro::Collections::PropertiesCollection]
    def properties
      return @properties if @properties

      found = Property.properties_from(node.element_children)
      @properties = Collections::PropertiesCollection.new(found)
    end

    # @see microformats2 Parsing Specification section 1.2
    # @see http://microformats.org/wiki/microformats2-parsing#parse_an_element_for_class_microformats
    #
    # @return [Hash] always has +:type+ and +:properties+; +:id+,
    #   +:children+, and +:value+ are added only when present
    def to_h
      { type: types, properties: properties.to_h }.tap do |result|
        result[:id] = id if id.present?
        result[:children] = children.to_a if children.any?
        result[:value] = value if value.present?
      end
    end

    # @return [Array<String>]
    def types
      @types ||= self.class.types_from(node)
    end

    # @param node [Nokogiri::XML::Element]
    # @return [Boolean] true when the node carries at least one item type
    def self.item_node?(node)
      !types_from(node).empty?
    end

    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @return [Array<MicroMicro::Item>]
    def self.items_from(context)
      nodes_from(context).map { |item_node| new(item_node) }
    end

    # Recursively collects item nodes below +context+ into +node_set+.
    # Ignored elements are skipped entirely. An element with item types is
    # collected unless it is also a property node, and is not descended
    # into. Any other element is searched through its element children.
    #
    # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
    # @param node_set [Nokogiri::XML::NodeSet]
    # @return [Nokogiri::XML::NodeSet]
    def self.nodes_from(context, node_set = Nokogiri::XML::NodeSet.new(context.document, []))
      case context
      when Nokogiri::HTML::Document
        return nodes_from(context.element_children, node_set)
      when Nokogiri::XML::NodeSet
        context.each { |member| nodes_from(member, node_set) }
      when Nokogiri::XML::Element
        return node_set if Document.ignore_node?(context)

        if item_node?(context)
          node_set << context unless Property.property_node?(context)
        else
          nodes_from(context.element_children, node_set)
        end
      end

      node_set
    end

    # Unique, sorted class tokens of the node that look like item types.
    #
    # @param node [Nokogiri::XML::Element]
    # @return [Array<String>]
    #
    # @example
    #   node = Nokogiri::HTML('<div class="h-card">Jason Garber</div>').at_css('div')
    #   MicroMicro::Item.types_from(node) #=> ['h-card']
    def self.types_from(node)
      node.classes.grep(/^h(?:\-[0-9a-z]+)?(?:\-[a-z]+)+$/).uniq.sort
    end

    private

    attr_reader :node

    # @return [MicroMicro::ImpliedProperty]
    def implied_name
      @implied_name ||= build_implied_property('name', 'p')
    end

    # @return [Boolean]
    def implied_name?
      imply_name? && implied_name.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_photo
      @implied_photo ||= build_implied_property('photo', 'u')
    end

    # @return [Boolean]
    def implied_photo?
      imply_photo? && implied_photo.value?
    end

    # @return [MicroMicro::ImpliedProperty]
    def implied_url
      @implied_url ||= build_implied_property('url', 'u')
    end

    # @return [Boolean]
    def implied_url?
      imply_url? && implied_url.value?
    end

    # @param name [String]
    # @param prefix [String]
    # @return [MicroMicro::ImpliedProperty]
    def build_implied_property(name, prefix)
      ImpliedProperty.new(node, name: name, prefix: prefix)
    end

    # True when there is no "name" property, no e- or p- prefixed property,
    # and no nested item.
    #
    # @return [Boolean]
    def imply_name?
      return false unless properties.none? { |prop| prop.name == 'name' }
      return false unless properties.none? { |prop| %w[e p].include?(prop.prefix) }

      !nested_items?
    end

    # True when there is no "photo" property, no non-implied u- prefixed
    # property, and no nested item.
    #
    # @return [Boolean]
    def imply_photo?
      return false unless properties.none? { |prop| prop.name == 'photo' }
      return false unless properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' }

      !nested_items?
    end

    # True when there is no "url" property, no non-implied u- prefixed
    # property, and no nested item.
    #
    # @return [Boolean]
    def imply_url?
      return false unless properties.none? { |prop| prop.name == 'url' }
      return false unless properties.reject(&:implied?).none? { |prop| prop.prefix == 'u' }

      !nested_items?
    end

    # Truthy when any property is itself an item node or when the item has
    # child items.
    #
    # @return [Boolean]
    def nested_items?
      return @nested_items if @nested_items

      @nested_items = properties.find(&:item_node?) || children.any?
    end
  end
end
@@ -0,0 +1,33 @@
1
module MicroMicro
  module Parsers
    # Base parser that turns a property's node into its plain-text value.
    class BasePropertyParser
      # @param property [MicroMicro::Property, MicroMicro::ImpliedProperty]
      def initialize(property)
        @property = property
        @node = property.node
      end

      # The stripped text content of the serialized node.
      #
      # @return [String]
      def value
        @value ||= serialized_node.text.strip
      end

      private

      attr_reader :node, :property

      # Prepares a copy of the node for text extraction: ignored elements
      # (see Document.ignored_node_names) are removed and each <img> has its
      # content replaced by its space-padded replacement text.
      #
      # The work is done on a duplicate so that the node inside the shared
      # document is left untouched for every other parser reading it.
      #
      # @see microformats2 Parsing Specification sections 1.3.1 and 1.3.4
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_p-_property
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_an_e-_property
      def serialized_node
        @serialized_node ||= begin
          copy = node.dup

          copy.css(*Document.ignored_node_names).unlink

          copy.css('img').each do |img|
            replacement = image_replacement_text(img)

            # An <img> with neither alt nor src contributes no text.
            img.content = " #{replacement} " if replacement
          end

          copy
        end
      end

      # The alt text of an <img> when present, otherwise its src resolved
      # against the document URL, otherwise nil.
      #
      # @param img [Nokogiri::XML::Element]
      # @return [String, nil]
      def image_replacement_text(img)
        alt = img['alt']
        return alt if alt

        src = img['src']
        Absolutely.to_abs(base: node.document.url, relative: src) if src
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
module MicroMicro
  module Parsers
    # Splits a date, time, and/or timezone string into its components and
    # builds a normalized "DATE TIMEZONE-SUFFIXED-TIME" value from them.
    class DateTimeParser
      # @see Value Class Pattern section 4.2
      # @see http://microformats.org/wiki/value-class-pattern#Date_and_time_parsing
      #
      # Regexp pattern matching YYYY-MM-DD and YYYY-DDD (ordinal days 000-366)
      DATE_REGEXP_PATTERN = '(?<year>\d{4})-((?<ordinal>36[0-6]|3[0-5]\d|[0-2]\d{2})|(?<month>0\d|1[0-2])-(?<day>3[0-1]|[0-2]\d))'.freeze
      # Regexp pattern matching HH, HH:MM, and HH:MM:SS with an optional
      # am/pm marker in either letter case, with or without periods
      TIME_REGEXP_PATTERN = '(?<hours>2[0-3]|[0-1]?\d)(?::(?<minutes>[0-5]\d))?(?::(?<seconds>[0-5]\d))?(?:\s*?(?<abbreviation>[aApP]\.?[mM]\.?))?'.freeze
      # Regexp pattern matching +/-(XX:YY|XXYY|XX) or the literal string Z
      TIMEZONE_REGEXP_PATTERN = '(?<zulu>Z)|(?<offset>(?:\+|-)(?:1[0-2]|0?\d)(?::?[0-5]\d)?)'.freeze

      CAPTURE_NAMES = [:year, :ordinal, :month, :day, :hours, :minutes, :seconds, :abbreviation, :zulu, :offset].freeze

      # @param string [String]
      def initialize(string)
        @string = string
      end

      # For every capture name, defines a reader (e.g. +year+) returning the
      # captured String or nil, and a predicate (e.g. +year?+).
      CAPTURE_NAMES.each do |name|
        define_method(name) { values[name] }
        define_method("#{name}?") { public_send(name).present? }
      end

      # @return [String, nil] "YYYY-MM-DD" when year, month, and day were captured
      def normalized_calendar_date
        @normalized_calendar_date ||= "#{year}-#{month}-#{day}" if year? && month? && day?
      end

      # @return [String, nil] the calendar date, falling back to the ordinal date
      def normalized_date
        @normalized_date ||= normalized_calendar_date || normalized_ordinal_date
      end

      # The hour on a 24-hour clock, zero-padded to two digits. With a "pm"
      # marker hours 1-11 are shifted by twelve, and with an "am" marker hour
      # 12 becomes 00, so 12pm stays 12 and 12am is midnight.
      #
      # @return [String, nil] nil when no hours were captured
      def normalized_hours
        return unless hours?

        @normalized_hours ||= begin
          # Convert explicitly: handing "08" or "09" straight to a %d
          # directive raises, because the String is read with octal prefix
          # detection.
          hour = hours.to_i

          case abbreviation&.tr('.', '')&.downcase
          when 'pm'
            hour += 12 if hour < 12
          when 'am'
            hour = 0 if hour == 12
          end

          format('%<hours>02d', hours: hour)
        end
      end

      # @return [String] the captured minutes, or "00" when none were captured
      def normalized_minutes
        @normalized_minutes ||= minutes || '00'
      end

      # @return [String, nil] "YYYY-DDD" when year and ordinal day were captured
      def normalized_ordinal_date
        @normalized_ordinal_date ||= "#{year}-#{ordinal}" if year? && ordinal?
      end

      # @return [String, nil] "HH:MM" or "HH:MM:SS"; nil when no hours were captured
      def normalized_time
        @normalized_time ||= [normalized_hours, normalized_minutes, seconds].compact.join(':') if normalized_hours
      end

      # @return [String, nil] "Z", or the offset with any colon removed
      def normalized_timezone
        @normalized_timezone ||= zulu || offset&.tr(':', '')
      end

      # @return [String] the normalized date, time, and timezone; empty when
      #   nothing could be parsed
      def value
        @value ||= "#{normalized_date} #{normalized_time}#{normalized_timezone}".strip
      end

      # @return [Boolean]
      def value?
        value.present?
      end

      # @return [Hash{Symbol => String, nil}]
      def values
        @values ||= self.class.values_from(string)
      end

      # Matches the whole string (not just one of its lines) against the
      # combined date, time, and timezone pattern. A final trailing newline
      # is tolerated.
      #
      # @param string [String, nil]
      # @return [Hash{Symbol => String, nil}] empty when the string is nil or
      #   does not match
      def self.values_from(string)
        string&.match(/\A(?:#{DATE_REGEXP_PATTERN})?(?:\s?#{TIME_REGEXP_PATTERN}(?:#{TIMEZONE_REGEXP_PATTERN})?)?\Z/)&.named_captures.to_h.symbolize_keys
      end

      private

      attr_reader :string
    end
  end
end
@@ -0,0 +1,65 @@
1
module MicroMicro
  module Parsers
    # Parser for dt- properties. It prefers a value recognised by
    # DateTimeParser, then a date-bearing HTML attribute, then the base
    # parser's plain-text value.
    class DateTimePropertyParser < BasePropertyParser
      # Attribute name => element names whose value is read from it.
      #
      # @see microformats2 Parsing Specification section 1.3.3
      # @see http://microformats.org/wiki/microformats2-parsing#parsing_a_dt-_property
      HTML_ATTRIBUTES_MAP = {
        'datetime' => %w[del ins time],
        'title' => %w[abbr],
        'value' => %w[data input]
      }.freeze

      # @return [String]
      def value
        return @value if @value
        return resolved_value if date_time_parser.value?
        return attribute_values.first if attribute_values.any?

        @value = super
      end

      private

      # Looks through the other dt- properties of the same collection (those
      # before this property, nearest first, followed by those after it) for
      # the first one whose value contains a date.
      #
      # @return [MicroMicro::Parsers::DateTimeParser, nil]
      def adopted_date_time
        return @adopted_date_time if @adopted_date_time

        segments = property.collection.select { |prop| prop.prefix == 'dt' }.split(property)
        preceding = segments.shift
        candidates = (preceding.reverse + segments).flatten
        parsers = candidates.map { |prop| DateTimeParser.new(prop.value) }

        @adopted_date_time = parsers.find(&:normalized_date)
      end

      # Values of the attributes that HTML_ATTRIBUTES_MAP associates with
      # this node's element name, skipping attributes that are absent.
      #
      # @return [Array<String>]
      def attribute_values
        @attribute_values ||= HTML_ATTRIBUTES_MAP.each_with_object([]) do |(attribute, names), found|
          next unless names.include?(node.name)

          attribute_value = node[attribute]
          found << attribute_value if attribute_value
        end
      end

      # @return [MicroMicro::Parsers::DateTimeParser]
      def date_time_parser
        @date_time_parser ||= DateTimeParser.new(value_class_pattern_parser.value)
      end

      # Truthy when the parsed value has a time but no date of its own.
      #
      # @return [Boolean]
      def imply_date?
        date_time_parser.normalized_time && !date_time_parser.normalized_date
      end

      # The parsed value, prefixed with a date adopted from a sibling dt-
      # property when this value only carries a time.
      #
      # @return [String]
      def resolved_value
        if imply_date? && adopted_date_time
          "#{adopted_date_time.normalized_date} #{date_time_parser.value}"
        else
          date_time_parser.value
        end
      end

      # @return [MicroMicro::Parsers::ValueClassPatternParser]
      def value_class_pattern_parser
        ValueClassPatternParser.new(node, ' ')
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module MicroMicro
  module Parsers
    # Parser for embedded markup: returns both the node's inner HTML (with
    # URL-bearing attributes made absolute) and its plain-text value.
    class EmbeddedMarkupPropertyParser < BasePropertyParser
      # Attributes whose values are resolved against the document URL.
      HTML_ATTRIBUTE_NAMES = %w[action cite code codebase data href poster src].freeze

      # @return [Hash{Symbol => String}] +:html+ holds the stripped inner
      #   HTML, +:value+ the base parser's text value
      def value
        return @value if @value

        # The markup is captured before the base parser's text value is
        # computed, matching the order in which the Hash was built before.
        markup = resolved_node.inner_html.strip
        text = super

        @value = { html: markup, value: text }
      end

      private

      # The node after every descendant attribute named in
      # HTML_ATTRIBUTE_NAMES has been rewritten, in place, to an absolute URL.
      def resolved_node
        return @resolved_node if @resolved_node

        HTML_ATTRIBUTE_NAMES.each do |attribute|
          node.css("[#{attribute}]").each do |element|
            relative = element[attribute].strip
            element[attribute] = Absolutely.to_abs(base: node.document.url, relative: relative)
          end
        end

        @resolved_node = node
      end
    end
  end
end