ae_easy-text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,283 @@
1
+ require 'cgi'
2
+ require 'json'
3
+ require 'digest/sha1'
4
+ require 'ae_easy-core'
5
+ require 'ae_easy/text/version'
6
+
7
module AeEasy
  # Text helpers: SHA1 hashing, HTML entity encoding/decoding, whitespace
  # normalization and parsers for table like HTML structures.
  module Text
    # Matches runs of whitespace. The ideographic space (U+3000) and the
    # non-breaking space (U+00A0) are listed explicitly because Ruby's `\s`
    # only covers ASCII whitespace.
    WHITESPACE_RUN = /(\s|\u3000|\u00a0)+/.freeze

    # Same as WHITESPACE_RUN plus the U+00C2 U+00A0 pair, used once a text had
    # to be re-encoded into UTF-8.
    REENCODED_WHITESPACE_RUN = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/.freeze

    private_constant :WHITESPACE_RUN, :REENCODED_WHITESPACE_RUN

    # Create a SHA1 hex digest from an object.
    #
    # A Hash is first reduced to its `Hash#hash` integer, anything else is
    # digested through its `to_s` representation.
    #
    # NOTE: Ruby does not guarantee `#hash` values to be identical across
    # interpreter invocations, so digests created from a Hash should only be
    # compared within the same process.
    #
    # NOTE: this method takes the place of the module object's own `#hash`,
    # which now requires one argument.
    #
    # @param [String,Hash,Object] object Object to create hash from.
    #
    # @return [String] 40 characters hexadecimal SHA1 digest.
    def self.hash(object)
      object = object.hash if object.is_a?(Hash)
      Digest::SHA1.hexdigest(object.to_s)
    end

    # Encode text for valid HTML entities.
    #
    # @param [String] text Text to encode.
    #
    # @return [String]
    def self.encode_html(text)
      CGI.escapeHTML(text)
    end

    # Decode HTML entities from text.
    #
    # @param [String] text Text to decode.
    #
    # @return [String]
    def self.decode_html(text)
      CGI.unescapeHTML(text)
    end

    # Strip a value: collapse every whitespace run (including U+3000 and
    # U+00A0) into a single space, trim both ends and decode HTML entities.
    #
    # When the text can't be matched against a UTF-8 regular expression it is
    # first converted to UTF-8 (see the private `reencode_to_utf8` helper).
    # The object passed in is never modified.
    #
    # @param [String,Object,nil] raw_text Text to strip.
    #
    # @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
    def self.strip(raw_text)
      return nil if raw_text.nil?

      text = raw_text.is_a?(String) ? raw_text : raw_text.to_s
      whitespace = WHITESPACE_RUN
      unless utf8_matchable?(text)
        text = reencode_to_utf8(text)
        whitespace = REENCODED_WHITESPACE_RUN
      end
      decode_html(text.gsub(whitespace, ' ').strip)
    end

    # Default cell content parser used to parse cell element. Removes the
    # nodes matching `//i` and saves the stripped cell text into +data+.
    #
    # NOTE(review): `//i` is an absolute XPath, so it presumably matches every
    # `<i>` node of the document rather than only the ones inside the cell;
    # confirm whether `.//i` was intended.
    #
    # @param [Nokogiri::Element,nil] cell_element Cell element to parse.
    # @param [Hash] data Data hash to save parsed data into.
    # @param [String,Symbol] key Header column key being parsed.
    #
    # @return [String,nil] Value saved into +data+, `nil` when there is no
    #   cell element.
    def self.default_parser(cell_element, data, key)
      # Both steps are nil safe: a header map index may point past the last
      # child of a row, in which case there is no element to parse.
      cell_element&.search('//i')&.remove
      data[key] = strip(cell_element&.text)
    end

    # Parse row data matching a selector using a header map to translate
    # between columns and friendly keys.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :selector CSS selector to match content rows.
    # @option opts [Boolean] :first_row_header (false) If true then first
    #   matching element will be assumed to be header and ignored.
    # @option opts [Hash{Symbol,String => Integer}] :header_map Header key vs
    #   index dictionary.
    # @option opts [Hash{Symbol,String => Proc}] :column_parsers ({})
    #   Custom column parsers for advanced data extraction. Each one is called
    #   with the cell element, the row data hash and the column key.
    #
    # @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
    # @yieldparam [Object] row Raw row element.
    # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
    # @yieldreturn [Boolean] `true` when valid, else `false`.
    #
    # @return [Array<Hash>,nil] Parsed rows data, `nil` when `:html` is nil.
    def self.parse_content(opts = {}, &filter)
      opts = {
        html: nil,
        selector: nil,
        first_row_header: false,
        header_map: {},
        column_parsers: {}
      }.merge(opts)
      return nil if opts[:html].nil?

      # Setup config
      header_map = opts[:header_map]
      column_parsers = opts[:column_parsers]
      skip_first_row = opts[:first_row_header]
      data = []

      # Get and parse rows
      opts[:html].css(opts[:selector]).each_with_index do |row, row_index|
        # First row header validation
        next if skip_first_row && row_index.zero?

        # Extract content data
        row_data = {}
        header_map.each do |key, index|
          parse_cell(row.children[index], row_data, key, column_parsers)
        end
        next unless filter.nil? || filter.call(row_data, row, header_map)

        data << row_data
      end
      data
    end

    # Extract column label and translate it into a friendly key. The nodes
    # matching `//i` are removed from the element before reading its text.
    #
    # @param [Nokogiri::Element,nil] element Html element to parse.
    # @param [Hash{Symbol,String => Regexp,String}] label_map Label dictionary
    #   for translation into key. Regexp values are matched against the
    #   stripped label, any other value is compared by equality.
    #
    # @return [Symbol,String,nil] Translated key, `nil` when no label matches.
    def self.translate_label_to_key(element, label_map)
      element&.search('//i')&.remove
      text = strip(element&.text)
      match = label_map.find do |_key, label|
        label.is_a?(Regexp) ? (text =~ label) : (text == label)
      end
      match&.first
    end

    # Parse header from selector and create a header map to match a column key
    # with column index.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :selector CSS selector to match header rows.
    # @option opts [Hash{Symbol,String => Regexp,String}] :column_key_label_map
    #   Key vs. label dictionary.
    # @option opts [Boolean] :first_row_header (false) If true then selector
    #   first matching row will be used as header for parsing.
    #
    # @return [Hash{Symbol,String => Integer},nil] Key vs. column index map of
    #   the first matching row, `nil` when the rows can't be selected or no
    #   row matches.
    def self.parse_header_map(opts = {})
      opts = {
        html: nil,
        selector: nil,
        column_key_label_map: {},
        first_row_header: false
      }.merge(opts)

      # Setup config
      dictionary = opts[:column_key_label_map]

      # Extract and parse header rows
      html_rows = select_rows(opts[:html], opts[:selector])
      return nil if html_rows.nil?

      # `compact` avoids parsing a nil row when nothing matched the selector.
      html_rows = [html_rows.first].compact if opts[:first_row_header]

      # Every selected row is translated (translation removes nodes from its
      # cells as a side effect) but only the first row map is returned.
      column_maps = html_rows.map do |row|
        column_map = {}
        row.children.each_with_index do |col, index|
          # Parse and map column header
          column_key = translate_label_to_key(col, dictionary)
          column_map[column_key] = index unless column_key.nil?
        end
        column_map
      end
      column_maps.first
    end

    # Parse data from a horizontal table like structure matching selectors and
    # using a header map to match columns.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :header_selector Header row elements selector.
    # @option opts [Hash{Symbol,String => Regexp,String}] :header_key_label_map
    #   Header key vs. label dictionary to match column indexes.
    # @option opts [String] :content_selector Content row elements selector.
    # @option opts [Boolean] :first_row_header (false) If true then selector
    #   first matching row will be used as header for parsing.
    # @option opts [Hash{Symbol,String => Proc}] :column_parsers ({})
    #   Custom column parsers for advanced data extraction.
    #
    # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
    # @yieldparam [Object] row Raw content row element.
    # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
    # @yieldreturn [Boolean] `true` when valid, else `false`.
    #
    # @return [Hash{Symbol => Array,Hash,nil},nil] `nil` when `:html` is nil
    #   or no header map could be parsed, else a hash as follows:
    #   * `[Hash] :header_map` Header map used.
    #   * `[Array<Hash>,nil] :data` Parsed rows data.
    def self.parse_table(opts = {}, &filter)
      opts = {
        html: nil,
        header_selector: nil,
        header_key_label_map: {},
        content_selector: nil,
        first_row_header: false,
        column_parsers: {}
      }.merge(opts)
      return nil if opts[:html].nil?

      header_map = parse_header_map(
        html: opts[:html],
        selector: opts[:header_selector],
        column_key_label_map: opts[:header_key_label_map],
        first_row_header: opts[:first_row_header]
      )
      return nil if header_map.nil?

      data = parse_content(
        html: opts[:html],
        selector: opts[:content_selector],
        header_map: header_map,
        first_row_header: opts[:first_row_header],
        column_parsers: opts[:column_parsers],
        &filter
      )
      { header_map: header_map, data: data }
    end

    # Parse data from a vertical table like structure matching selectors and
    # using a label dictionary to translate each row header into a key.
    #
    # The block is accepted for signature parity with `parse_table`, but it is
    # currently never called.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :row_selector Vertical row like elements selector.
    # @option opts [String] :header_selector Header element selector, applied
    #   to each row.
    # @option opts [Hash{Symbol,String => Regexp,String}] :header_key_label_map
    #   Header key vs. label dictionary.
    # @option opts [String] :content_selector Content element selector, applied
    #   to each row.
    # @option opts [Hash{Symbol,String => Proc}] :column_parsers ({})
    #   Custom column parsers for advanced data extraction.
    #
    # @return [Hash{Symbol,String => Object},nil] Parsed values by translated
    #   key, `nil` when `:html` is nil or the rows can't be selected.
    def self.parse_vertical_table(opts = {}, &filter)
      opts = {
        html: nil,
        row_selector: nil,
        header_selector: nil,
        header_key_label_map: {},
        content_selector: nil,
        column_parsers: {}
      }.merge(opts)
      return nil if opts[:html].nil?

      # Setup config
      data = {}
      dictionary = opts[:header_key_label_map]
      column_parsers = opts[:column_parsers]

      # Extract headers and content
      html_rows = select_rows(opts[:html], opts[:row_selector])
      return nil if html_rows.nil?

      html_rows.each do |row|
        # Parse and map column header
        key = translate_label_to_key(row.css(opts[:header_selector]), dictionary)
        next if key.nil? || key == ''

        # Parse column html with default or custom parser
        parse_cell(row.css(opts[:content_selector]), data, key, column_parsers)
      end
      data
    end

    # Check whether a text can be matched against a UTF-8 regular expression,
    # which fails on invalid byte sequences and on incompatible encodings.
    #
    # @param [String] text Text to check.
    #
    # @return [Boolean]
    def self.utf8_matchable?(text)
      text =~ /\u3000/
      true
    rescue ArgumentError, EncodingError
      false
    end
    private_class_method :utf8_matchable?

    # Convert a text into UTF-8 without modifying the given string.
    #
    # The bytes are interpreted using `$APP_CONFIG[:encoding]` when it is
    # configured; otherwise the text is transcoded from its own encoding and
    # every invalid or undefined sequence is replaced.
    #
    # @param [String] text Text to convert.
    #
    # @return [String] UTF-8 copy of the text.
    def self.reencode_to_utf8(text)
      source_encoding = $APP_CONFIG && $APP_CONFIG[:encoding]
      if source_encoding
        # `force_encoding` mutates its receiver, so work on a copy.
        text.dup.force_encoding(source_encoding).encode('UTF-8')
      else
        text.encode('UTF-8', invalid: :replace, undef: :replace).scrub
      end
    end
    private_class_method :reencode_to_utf8

    # Select the rows matching a CSS selector, best effort.
    #
    # @param [Nokogiri::Element,nil] html Container element to search into.
    # @param [String,nil] selector CSS selector.
    #
    # @return [Object,nil] Matching rows, `nil` when the search fails.
    def self.select_rows(html, selector)
      html.css(selector)
    rescue StandardError
      nil
    end
    private_class_method :select_rows

    # Parse an element into +data+ using the custom parser registered for
    # +key+, or the default parser when there is none.
    #
    # @param [Nokogiri::Element,nil] element Element to parse.
    # @param [Hash] data Data hash to save parsed data into.
    # @param [String,Symbol] key Column key being parsed.
    # @param [Hash{Symbol,String => Proc}] column_parsers Custom parsers.
    def self.parse_cell(element, data, key, column_parsers)
      parser = column_parsers[key]
      if parser.nil?
        default_parser(element, data, key)
      else
        parser.call(element, data, key)
      end
    end
    private_class_method :parse_cell
  end
end
@@ -0,0 +1,6 @@
1
module AeEasy
  module Text
    # Gem version. Frozen so the constant can't be mutated in place.
    VERSION = '0.0.1'.freeze
  end
end
metadata ADDED
@@ -0,0 +1,186 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ae_easy-text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Eduardo Rosales
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-02-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ae_easy-core
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.16.3
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.16.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '5.11'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '5.11'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 0.16.1
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 0.16.1
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov-console
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.4.2
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 0.4.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: timecop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: 0.9.1
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: 0.9.1
111
+ - !ruby/object:Gem::Dependency
112
+ name: byebug
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: AnswersEngine Easy toolkit text module contains multiple text parsing
126
+ helpers.
127
+ email:
128
+ - eduardo@datahen.com
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".gitignore"
134
+ - ".travis.yml"
135
+ - ".yardopts"
136
+ - CODE_OF_CONDUCT.md
137
+ - Gemfile
138
+ - LICENSE
139
+ - README.md
140
+ - Rakefile
141
+ - ae_easy-text.gemspec
142
+ - doc/AeEasy.html
143
+ - doc/AeEasy/Text.html
144
+ - doc/_index.html
145
+ - doc/class_list.html
146
+ - doc/css/common.css
147
+ - doc/css/full_list.css
148
+ - doc/css/style.css
149
+ - doc/file.README.html
150
+ - doc/file_list.html
151
+ - doc/frames.html
152
+ - doc/index.html
153
+ - doc/js/app.js
154
+ - doc/js/full_list.js
155
+ - doc/js/jquery.js
156
+ - doc/method_list.html
157
+ - doc/top-level-namespace.html
158
+ - lib/ae_easy/text.rb
159
+ - lib/ae_easy/text/version.rb
160
+ homepage: https://answersengine.com
161
+ licenses:
162
+ - MIT
163
+ metadata:
164
+ homepage_uri: https://answersengine.com
165
+ source_code_uri: https://github.com/answersengine/ae_easy-text
166
+ post_install_message:
167
+ rdoc_options: []
168
+ require_paths:
169
+ - lib
170
+ required_ruby_version: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: 2.2.2
175
+ required_rubygems_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ requirements: []
181
+ rubyforge_project:
182
+ rubygems_version: 2.7.6
183
+ signing_key:
184
+ specification_version: 4
185
+ summary: AnswersEngine Easy toolkit text module
186
+ test_files: []