html_entry 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7d159801477390ab4b8934b0fa0d8db6238ef931a4e3090cbae0e5d735199966
4
+ data.tar.gz: a1a7f1209bbd12744f5ebe2fd54e3bd3d3fd43c556d309fde9a5607c5566cdaf
5
+ SHA512:
6
+ metadata.gz: da0a2352cc43bf9bcb5339cb399303575f85eb9e9bda15aad1db69d20563c903e8bd2f11204741ee65472555e62db160f178c5bcf5280e6decbe5bd2d229ee36
7
+ data.tar.gz: ec7290fbf672fb50459e03de7b0156ab271092a0f194ad5b87fb51f75eef1ccd25e0d630d7f977f5e28d78e661a4fb94a837daebe0ff0268d49b2a9586dcf989
@@ -0,0 +1,4 @@
1
module HtmlEntry
  ##
  # Library-specific error class: a plain StandardError subclass that adds
  # no behavior of its own.
  #
  class Error < StandardError; end
end
@@ -0,0 +1,198 @@
1
+ require 'nokogiri'
2
+ require_relative '../error'
3
+ require_relative 'values_collector'
4
+ require_relative '../page'
5
+
6
module HtmlEntry
  module Page
    ##
    # Reads data from an HTML/XML block according to a list of instructions.
    #
    # Every instruction is a Hash. Keys read by this class (and by
    # Page.fetch_nodes, which locates the nodes):
    # - :selector / :css - CSS selector of the node(s)
    # - :xpath           - XPath expression of the node(s)
    # - :data            - Hash of data key name => data instruction; each
    #                      entry is handed to ValuesCollector#fetch together
    #                      with the located node
    # - :merge           - plenty mode only: put every node of this
    #                      instruction into the first result item
    # - :allow_empty     - plenty mode only: process :data once with a nil
    #                      node when the selector matched nothing
    #
    # @see tests/html_entry/page/test_entity_fetcher.rb
    #
    class EntityFetcher
      # Get instructions
      #
      # @return [Array<Hash>]
      #
      attr_reader :instructions

      ##
      # Init
      #
      def initialize
        # Not read anywhere in this class; kept so that existing subclasses
        # relying on the variable keep working.
        @selector_cache ||= {}
      end

      ##
      # Set instructions. A single instruction is wrapped into an Array.
      #
      # Example reading text and an attribute of the first '.vote' node and
      # computing a derived value:
      #   [
      #     {
      #       selector: '.vote',
      #       data: {
      #         label:     {},                                   # stripped text
      #         vote_up:   { type: :attribute, attribute: 'data-up' },
      #         vote_down: { type: :attribute, attribute: 'data-down' },
      #         vote_diff: {
      #           type:     :function,
      #           function: proc { |name, instruction, data, options|
      #             data[:vote_up].to_i - data[:vote_down].to_i
      #           }
      #         }
      #       }
      #     }
      #   ]
      #
      # Data instruction types and filters are interpreted by
      # ValuesCollector#fetch; the Proc of a :function instruction receives
      # the data key name, the data instruction, the data collected so far and
      # the collector options, in that order.
      #
      # @param [Array<Hash>, Hash] instructions
      # @return [Array<Hash>]
      #
      def instructions=(instructions)
        @instructions = instructions.is_a?(Array) ? instructions : [instructions]
      end

      ##
      # Fetch data from document
      #
      # @param [Nokogiri::HTML::Document, Nokogiri::XML::Element] document
      # @param [TrueClass, FalseClass] plenty true: one result item per found
      #   node (see #fetch_plenty); false: a single Hash (see #fetch_single)
      # @return [Hash, Array<Hash>]
      #
      def fetch(document:, plenty: false)
        plenty ? fetch_plenty(document) : fetch_single(document)
      end

      ##
      # Fetch single data from document: only the first node found for each
      # instruction is used and all values land in one Hash.
      #
      # @param [Nokogiri::HTML::Document, Nokogiri::XML::Element] document
      # @return [Hash]
      #
      def fetch_single(document)
        collector = get_values_collector(document)

        instructions.each do |instruction|
          node = Page.fetch_node(document, instruction)
          collect_data(collector, instruction, node)
        end

        collector.data
      end

      ##
      # Get value collector
      #
      # @param [Nokogiri::HTML::Document, Nokogiri::XML::Element] document
      # @return [Page::ValuesCollector]
      #
      def get_values_collector(document)
        Page::ValuesCollector.new document: document,
                                  instructions: instructions
      end

      ##
      # Fetch collection data from document. Node number i of every
      # instruction feeds result item number i.
      #
      # @param [Nokogiri::HTML::Document, Nokogiri::XML::Element] document
      # @return [Array<Hash>]
      # @raise [RuntimeError] when instructions were never set, or an
      #   instruction is not a Hash
      #
      def fetch_plenty(document)
        raise 'Instructions must be an array.' unless instructions.is_a?(Array)

        collectors, data = process_instructions(document)
        collectors.each_value { |collector| data.push(collector.data) }
        data
      end

      protected

      ##
      # Run every instruction against the document.
      #
      # @return [Array] pair of collectors (Hash of index => ValuesCollector)
      #   and an empty Array the caller fills with the collected data
      #
      def process_instructions(document)
        data = []
        collectors = {}
        instructions.each do |instruction|
          raise 'Instruction must be Hash.' unless instruction.is_a?(Hash)

          retrieve_nodes(document, instruction).each_with_index do |node, i|
            process_node(document, node, instruction, collectors, i)
          end
        end
        [collectors, data]
      end

      ##
      # Find the nodes of an instruction. Returns a single nil node when the
      # lookup returned nil, or when it matched nothing and the instruction
      # sets :allow_empty, so that the :data entries are still processed once.
      #
      # @param [Hash] instruction
      #
      def retrieve_nodes(document, instruction)
        nodes = Page.fetch_nodes(document, instruction)
        return [nil] if nodes.nil?
        return [nil] if instruction[:allow_empty] && nodes.count.zero?

        nodes
      end

      ##
      # Feed one node into the collector that belongs to its index, creating
      # the collector on first use.
      #
      def process_node(document, node, instruction, collectors, index)
        # gather items under the same collector
        index = 0 if instruction[:merge]

        collectors[index] = get_values_collector(document) unless collectors.key?(index)
        collect_data(collectors[index], instruction, node)
      end

      ##
      # Pass every :data entry of the instruction to the collector.
      #
      def collect_data(collector, instruction, node)
        return unless instruction[:data]

        instruction[:data].each do |name, data_instruction|
          collector.fetch name, data_instruction, node
        end
      end

      ##
      # Tell whether any :data entry of the instruction is a Hash whose
      # +option+ equals +value+. Always false when the instruction declares
      # :merge, or has no :data. The instruction is not modified.
      #
      def data_has_option?(instruction, option:, value:)
        return false if instruction.key?(:merge)

        entries = instruction[:data]
        return false unless entries

        entries.any? { |_name, element| element.is_a?(Hash) && element[option] == value }
      end
    end
  end
end
@@ -0,0 +1,172 @@
1
+ require 'nokogiri'
2
+ require_relative '../error'
3
+
4
module HtmlEntry
  module Page
    ##
    # This class is responsible for getting values according to a data
    # instruction and keeping them under their data key names.
    #
    # @see tests/html_entry/page/test_entity_fetcher.rb
    #
    class ValuesCollector
      # Collected data, keyed by data key name
      #
      # @return [Hash]
      #
      attr_reader :data

      ##
      # @param [Hash] options extra options; they are passed to :function
      #   Procs, and :instructions is reused by :children instructions that
      #   ask for :the_same instructions
      #
      def initialize(options = {})
        @options = options
        @data = {}
      end

      ##
      # Fetch value of element and store it under +name+.
      #
      # Instruction keys read here:
      # - :type      - :attribute (needs :attribute), :function (needs
      #                :function), :boolean / :bool (is there a node?),
      #                :children (needs :instructions), :value or omitted
      #                (the node itself, then filtered)
      # - :default   - used when the type above produced nothing
      # - :filter    - see #filter
      # - :overwrite - allow replacing a value already stored under +name+
      #
      # Two Array values fetched for the same name are concatenated.
      #
      # @param [Symbol] name
      # @param [Hash] instruction
      # @param [Nokogiri::XML::Element, nil] node
      # @return [Object] the value now stored under +name+
      # @raise [HtmlEntry::Error] unknown type for a found node, or
      #   :function is not a Proc
      # @raise [RuntimeError] a value is already stored and :overwrite is
      #   not set
      #
      def fetch(name, instruction, node)
        value = filter_value(resolve_value(name, instruction, node), instruction)
        store(name, value, instruction)
      end

      protected

      ##
      # Produce the raw (unfiltered) value described by the instruction.
      #
      def resolve_value(name, instruction, node)
        type = instruction[:type]

        # these two types do not need a node
        return call_function(name, instruction) if type == :function
        return (node ? true : false) if %i[boolean bool].include?(type)

        if node
          case type
          when :attribute
            return get_node_attribute(node, instruction)
          when :children
            return children(name: name, instruction: instruction, node: node,
                            plenty: instruction[:children_plenty])
          when :value, nil
            # empty type should be determined as :value
            return node
          end
        end

        return instruction[:default] if instruction.is_a?(Hash) && !instruction[:default].nil?
        return nil if node.nil?

        raise HtmlEntry::Error, 'Unknown instruction type or XML/HTML value not found.'
      end

      ##
      # Store a value under +name+: concatenate when both the stored and the
      # new value are Arrays, otherwise set it. Replacing an existing non-nil
      # value requires :overwrite.
      #
      def store(name, value, instruction)
        current = data[name]

        if current.instance_of?(Array) && value.instance_of?(Array)
          data[name] = [current, value].flatten
        else
          if !current.nil? && !instruction[:overwrite]
            raise "Value already set for data key name '#{name}'."
          end

          data[name] = value
        end

        data[name]
      end

      ##
      # Fetch nested data below +node+ with a new EntityFetcher.
      #
      # @param [Hash] instruction its :instructions are the nested
      #   instructions; :the_same reuses the :instructions option of this
      #   collector
      # @param [Nokogiri::XML::Element] node
      # @param [TrueClass, FalseClass, nil] plenty nil means true
      # @param [Symbol] name unused
      # @return [Hash, Array]
      #
      def children(instruction:, node:, plenty: nil, name: nil)
        nested = if instruction[:instructions] == :the_same
                   @options[:instructions]
                 else
                   instruction[:instructions]
                 end

        fetcher = Page::EntityFetcher.new
        fetcher.instructions = nested
        fetcher.fetch document: node,
                      plenty: plenty.nil? ? true : plenty
      end

      ##
      # Filter value with the :filter of the instruction
      #
      # @param [Object] value
      # @param [Hash] instruction
      # @return [Object]
      #
      def filter_value(value, instruction)
        filter(value, instruction[:filter])
      end

      ##
      # Filter fetched value
      #
      # - :element   - value as is
      # - :node_text - value.to_s, stripped (for an element: text with tags)
      # - :to_i      - text converted with to_i
      # - :no_strip  - text, not stripped
      # - omitted    - text, stripped; non-String values are returned as is
      #
      # "text" is value.text when the value is a Nokogiri::XML::Element.
      #
      # @param [Object] value
      # @param [Symbol] filter
      # @return [Object]
      #
      def filter(value, filter = nil)
        return value if filter == :element
        return value.to_s.strip if filter == :node_text

        # text without tags by default
        value = value.text if value.instance_of?(Nokogiri::XML::Element)

        return value.to_i if filter == :to_i
        return value.to_s if filter == :no_strip
        return value.strip if value.is_a? String

        value
      end

      ##
      # @param [Nokogiri::XML::Element] node
      # @param [Hash] instruction its :attribute is the attribute name
      # @return [String]
      #
      def get_node_attribute(node, instruction)
        node[instruction[:attribute]]
      end

      ##
      # Call custom function. The Proc receives the data key name, the
      # instruction, the data collected so far and the collector options.
      #
      # @param [Symbol] name
      # @param [Hash] instruction
      # @return [Object] whatever the Proc returns
      # @raise [HtmlEntry::Error] when :function is not a Proc
      #
      def call_function(name, instruction)
        function = instruction[:function]
        raise HtmlEntry::Error, ':function is not instance of Proc' unless function.is_a?(Proc)

        function.call name, instruction, data, @options
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module HtmlEntry
  ##
  # Page module: node lookup helpers, callable as Page.fetch_node and
  # Page.fetch_nodes.
  #
  module Page
    ##
    # Get the first node found by XPath or CSS selector
    #
    # @param [Nokogiri::XML::Node] document
    # @param [Hash] instruction
    # @return [Nokogiri::XML::Element, nil] nil when nothing was found or the
    #   instruction has no selector
    #
    def fetch_node(document, instruction)
      nodes = fetch_nodes(document, instruction)
      nodes.first if nodes
    end

    ##
    # Get nodes by XPath or CSS selector. The first present key of
    # :selector (CSS), :css, :xpath is used.
    #
    # @param [Nokogiri::XML::Node] document any Nokogiri document or node,
    #   e.g. Nokogiri::HTML::Document or Nokogiri::XML::Element
    # @param [Hash] instruction
    # @return [Nokogiri::XML::NodeSet, nil] nil when the instruction has none
    #   of the three keys
    # @raise [RuntimeError] when document is not a Nokogiri node
    #
    def fetch_nodes(document, instruction)
      # Documents, elements and their subclasses all descend from XML::Node
      # and all provide #css and #xpath.
      unless document.is_a?(Nokogiri::XML::Node)
        raise '"document" must be a Nokogiri document or node (Nokogiri::XML::Node).'
      end

      css = instruction[:selector] || instruction[:css]
      return document.css(css) if css

      document.xpath(instruction[:xpath]) if instruction[:xpath]
    end

    module_function :fetch_nodes, :fetch_node
  end
end
@@ -0,0 +1,109 @@
1
+ require_relative 'page/entity_fetcher'
2
+
3
module HtmlEntry
  ##
  # Page fetcher: splits a document into blocks and fetches the entities of
  # every block.
  #
  class PageFetcher
    # Instructions. Keys read by this class:
    # - :entity    - instructions handed to Page::EntityFetcher
    # - :block     - optional instruction locating the blocks to process
    # - :last_page - instruction used by #last_page?
    # A :block / :last_page instruction is either a node lookup for
    # Page.fetch_nodes or { type: :function, function: <callable> }.
    #
    # @return [Hash]
    #
    attr_accessor :instructions

    # Fetch entities from document
    #
    # @param [Nokogiri::HTML::Document] document
    # @return [Array] entities of all blocks, in block order
    #
    def fetch(document)
      items = []
      block_documents(document).each do |block_document|
        fetch_data(block_document, instructions[:entity]).each do |element|
          items.push element
        end
      end
      items
    end

    ##
    # Check if it's a last page: a :function instruction decides by the
    # truthiness of its result, any other instruction by whether at least one
    # node is found.
    #
    # @param [Nokogiri::HTML::Document] document
    # @return [TrueClass, FalseClass]
    # @raise [RuntimeError] when no :last_page instruction is set
    #
    def last_page?(document)
      instruction = instructions[:last_page]
      raise 'Instructions are not set.' if instruction.nil?

      return !!call_function(document, instruction) if instruction[:type] == :function

      # fetch_nodes returns nil when the instruction carries no selector
      nodes = Page.fetch_nodes(document, instruction)
      !nodes.nil? && nodes.count > 0
    end

    protected

    ##
    # Documents to fetch entities from: the blocks found by the :block
    # instruction or, without one, the <body> of an HTML document (any other
    # document is used as is).
    #
    # @param [Nokogiri::HTML::Document] document
    # @return [Enumerable]
    #
    def block_documents(document)
      unless instructions[:block].nil?
        return fetch_block_document(document, instructions[:block])
      end

      return [document] unless document.is_a?(Nokogiri::HTML::Document)

      [fetch_block_document(document, type: :selector, selector: 'body').first]
    end

    ##
    # Fetch entity data
    #
    # @param [Nokogiri::XML::Element] entity_document
    # @param [Hash, Array] instructions
    # @return [Array]
    #
    def fetch_data(entity_document, instructions)
      fetcher = Page::EntityFetcher.new
      fetcher.instructions = instructions
      fetcher.fetch(document: entity_document, plenty: true)
    end

    ##
    # Fetch entities on a page
    #
    # @param [Nokogiri::HTML::Document] document
    # @param [Hash] instructions block instruction
    # @return [Nokogiri::XML::NodeSet] or whatever a :function returns
    # @raise [RuntimeError] when instructions is nil
    #
    def fetch_block_document(document, instructions)
      raise 'Instructions are not set.' if instructions.nil?

      return call_function(document, instructions) if instructions[:type] == :function

      Page.fetch_nodes(document, instructions)
    end

    ##
    # Call custom function with the document and the instruction
    #
    # @param [Nokogiri::HTML::Document] document
    # @param [Hash] instruction
    # @return [*]
    #
    def call_function(document, instruction)
      instruction[:function].call document, instruction
    end
  end
end
@@ -0,0 +1,3 @@
1
module HtmlEntry
  # Version of the html_entry gem
  VERSION = '0.1.0'.freeze
end
data/lib/html_entry.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'html_entry/version'
2
+
3
##
# Html entry top module (namespace of the gem)
module HtmlEntry; end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_entry
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kirby Rs
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-08-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: |-
56
+ A simple gem which allows to organize fetching entries \
57
+ from plain HTML.
58
+ email: bizkirby@gmail.com
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - lib/html_entry.rb
64
+ - lib/html_entry/error.rb
65
+ - lib/html_entry/page.rb
66
+ - lib/html_entry/page/entity_fetcher.rb
67
+ - lib/html_entry/page/values_collector.rb
68
+ - lib/html_entry/page_fetcher.rb
69
+ - lib/html_entry/version.rb
70
+ homepage: https://github.com/rikby/html-entries
71
+ licenses:
72
+ - GPL-3.0
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 2.7.7
91
+ signing_key:
92
+ specification_version: 4
93
+ summary: HTML entries fetcher
94
+ test_files: []