ae_easy-text 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,283 @@
1
+ require 'cgi'
2
+ require 'json'
3
+ require 'digest/sha1'
4
+ require 'ae_easy-core'
5
+ require 'ae_easy/text/version'
6
+
7
module AeEasy
  # Text helpers: digests, HTML entity handling, whitespace cleanup and
  # parsing of table-like HTML structures.
  #
  # The parsing helpers only rely on the elements responding to `css`,
  # `children`, `search` and `text` (the Nokogiri node/node-set interface).
  module Text
    # Create a SHA1 hex digest from an object.
    #
    # Hashes are digested from a canonical, key-order-independent string
    # built from their (recursively canonicalized) pairs, so equal content
    # produces the same digest in every Ruby process. `Hash#hash` is not used
    # because its value may change between Ruby invocations. Any other object
    # is digested from its `to_s` value.
    #
    # @note This one-argument method shadows the argument-less `Object#hash`
    #   on this module.
    #
    # @param [String,Hash,Object] object Object to create hash from.
    #
    # @return [String] Hexadecimal SHA1 digest.
    def self.hash(object)
      source = object.is_a?(Hash) ? canonicalize(object) : object.to_s
      Digest::SHA1.hexdigest source
    end

    # Build a deterministic string representation of a value. Hash pairs are
    # sorted so insertion order does not matter; scalars use `inspect` so
    # `:a` and `"a"` stay distinguishable.
    #
    # @param [Object] value Value to represent.
    #
    # @return [String]
    def self.canonicalize(value)
      case value
      when Hash
        pairs = value.map { |k, v| "#{canonicalize(k)}=>#{canonicalize(v)}" }
        "{#{pairs.sort.join(',')}}"
      when Array
        "[#{value.map { |item| canonicalize(item) }.join(',')}]"
      else
        value.inspect
      end
    end
    private_class_method :canonicalize

    # Encode text for valid HTML entities.
    #
    # @param [String] text Text to encode.
    #
    # @return [String]
    def self.encode_html(text)
      CGI.escapeHTML text
    end

    # Decode HTML entities from text.
    #
    # @param [String] text Text to decode.
    #
    # @return [String]
    def self.decode_html(text)
      CGI.unescapeHTML text
    end

    # Strip a value: collapse every run of whitespace (ASCII whitespace,
    # ideographic space U+3000, no-break space U+00A0) into a single space,
    # trim both ends and decode HTML entities.
    #
    # When the text cannot be matched against a UTF-8 regular expression
    # (invalid bytes or an incompatible encoding) a UTF-8 copy is made first:
    # bytes are reinterpreted as `$APP_CONFIG[:encoding]` when that is set,
    # otherwise the text is transcoded from its declared encoding with
    # unconvertible bytes replaced. The caller's string is never modified.
    #
    # @param [String,Object,nil] raw_text Text to strip.
    #
    # @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
    def self.strip(raw_text)
      return nil if raw_text.nil?

      text = raw_text.is_a?(String) ? raw_text : raw_text.to_s
      spaces = /(\s|\u3000|\u00a0)+/
      unless regex_safe?(text)
        text = to_utf8(text)
        # Also collapse "\u00c2\u00a0", the sequence a no-break space turns
        # into when its UTF-8 bytes are read as a single-byte encoding.
        spaces = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
      end
      decode_html text.gsub(spaces, ' ').strip
    end

    # Check whether a UTF-8 regular expression can be applied to the text.
    #
    # @param [String] text Text to probe.
    #
    # @return [Boolean]
    def self.regex_safe?(text)
      return false unless text.valid_encoding?

      text =~ /\u3000/
      true
    rescue ArgumentError, EncodingError
      false
    end
    private_class_method :regex_safe?

    # Build a UTF-8 copy of a text that failed the {regex_safe?} probe.
    #
    # @param [String] text Text to convert; left untouched.
    #
    # @return [String] New UTF-8 string.
    def self.to_utf8(text)
      configured = $APP_CONFIG && $APP_CONFIG[:encoding]
      return text.dup.force_encoding(configured).encode('UTF-8') if configured

      text.scrub.encode('UTF-8', invalid: :replace, undef: :replace)
    end
    private_class_method :to_utf8

    # Default cell content parser used to parse cell element. Removes the
    # elements matched by `//i` and stores the stripped cell text under
    # +key+; a missing (nil) cell stores `nil`.
    #
    # NOTE(review): `//i` is an absolute XPath, so on Nokogiri nodes it
    # presumably matches `<i>` elements document-wide rather than only inside
    # the cell; kept as is, confirm whether `.//i` was intended.
    #
    # @param [Nokogiri::Element,nil] cell_element Cell element to parse.
    # @param [Hash] data Data hash to save parsed data into.
    # @param [String,Symbol] key Header column key being parsed.
    #
    # @return [String,nil] Stored value.
    def self.default_parser(cell_element, data, key)
      cell_element&.search('//i')&.remove
      data[key] = strip(cell_element&.text)
    end

    # Parse row data matching a selector using a header map to translate
    # between columns and friendly keys.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :selector CSS selector to match content rows.
    # @option opts [Boolean] :first_row_header (false) If true then first
    #   matching element will be assumed to be header and ignored.
    # @option opts [Hash{Symbol,String => Integer}] :header_map Header key vs
    #   index dictionary.
    # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
    #   Custom column parsers for advanced data extraction. Each one is called
    #   with `(cell_element, row_data, key)`.
    #
    # @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
    # @yieldparam [Array] row Raw row data.
    # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
    # @yieldreturn [Boolean] `true` when valid, else `false`.
    #
    # @return [Array<Hash>,nil] Parsed rows data, `nil` when `:html` is nil.
    def self.parse_content(opts = {}, &filter)
      opts = {
        html: nil,
        selector: nil,
        first_row_header: false,
        header_map: {},
        column_parsers: {}
      }.merge opts
      return nil if opts[:html].nil?

      header_map = opts[:header_map] || {}
      column_parsers = opts[:column_parsers] || {}
      fallback_parser = method(:default_parser)

      rows = opts[:html].css(opts[:selector])
      rows = rows.drop(1) if opts[:first_row_header]
      rows.each_with_object([]) do |row, parsed|
        row_data = {}
        header_map.each do |key, index|
          parser = column_parsers[key] || fallback_parser
          parser.call(row.children[index], row_data, key)
        end
        parsed << row_data if filter.nil? || filter.call(row_data, row, header_map)
      end
    end

    # Extract column label and translate it into a friendly key.
    #
    # The label is the stripped element text once the elements matched by
    # `//i` are removed. Regexp dictionary values are matched against the
    # label, any other value is compared for equality; the first matching
    # entry wins.
    #
    # @param [Nokogiri::Element,nil] element Html element to parse.
    # @param [Hash{Symbol,String => Regex,String}] label_map Label dictionary
    #   for translation into key.
    #
    # @return [Symbol,String,nil] Translated key, `nil` when nothing matches.
    def self.translate_label_to_key(element, label_map)
      element&.search('//i')&.remove
      text = strip(element&.text)
      match = label_map.find do |_key, label|
        label.is_a?(Regexp) ? label =~ text : text == label
      end
      match&.first
    end

    # Parse header from selector and create a header map to match a column key
    # with column index.
    #
    # Only the first row matched by the selector is used to build the map.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :selector CSS selector to match header rows.
    # @option opts [Hash{Symbol,String => Regex,String}] :column_key_label_map
    #   Key vs. label dictionary.
    # @option opts [Boolean] :first_row_header (false) Accepted for
    #   compatibility; the first matching row is the header either way.
    #
    # @return [Hash{Symbol,String => Integer},nil] Key vs. column index map,
    #   `nil` when the rows cannot be selected or no row matches.
    def self.parse_header_map(opts = {})
      opts = {
        html: nil,
        selector: nil,
        column_key_label_map: {},
        first_row_header: false
      }.merge opts
      dictionary = opts[:column_key_label_map]

      # Best effort: a nil container or a failing selector means "no header".
      header_row = begin
        opts[:html].css(opts[:selector]).first
      rescue StandardError
        nil
      end
      return nil if header_row.nil?

      column_map = {}
      header_row.children.each_with_index do |cell, index|
        column_key = translate_label_to_key(cell, dictionary)
        column_map[column_key] = index unless column_key.nil?
      end
      column_map
    end

    # Parse data from a horizontal table like structure matching selectors and
    # using a header map to match columns.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :header_selector Header row elements selector.
    # @option opts [Hash{Symbol,String => Regex,String}] :header_key_label_map
    #   Header key vs. label dictionary to match column indexes.
    # @option opts [String] :content_selector Content row elements selector.
    # @option opts [Boolean] :first_row_header (false) If true then the first
    #   row matched by the content selector is skipped as header.
    # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
    #   Custom column parsers for advanced data extraction.
    #
    # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
    # @yieldparam [Array] row Raw content row data.
    # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
    # @yieldreturn [Boolean] `true` when valid, else `false`.
    #
    # @return [Hash{Symbol => Array,Hash,nil},nil] `nil` when `:html` is nil
    #   or no header map can be built, else a hash as follows:
    #   * `[Hash] :header_map` Header map used.
    #   * `[Array<Hash>,nil] :data` Parsed rows data.
    def self.parse_table(opts = {}, &filter)
      opts = {
        html: nil,
        header_selector: nil,
        header_key_label_map: {},
        content_selector: nil,
        first_row_header: false,
        column_parsers: {}
      }.merge opts
      return nil if opts[:html].nil?

      header_map = parse_header_map(
        html: opts[:html],
        selector: opts[:header_selector],
        column_key_label_map: opts[:header_key_label_map],
        first_row_header: opts[:first_row_header]
      )
      return nil if header_map.nil?

      data = parse_content(
        html: opts[:html],
        selector: opts[:content_selector],
        header_map: header_map,
        first_row_header: opts[:first_row_header],
        column_parsers: opts[:column_parsers],
        &filter
      )
      { header_map: header_map, data: data }
    end

    # Parse data from a vertical table like structure: every row holds one
    # header element (translated into a key) and one content element (parsed
    # into the value for that key). Rows whose header has no matching key are
    # skipped.
    #
    # @note The block is accepted for signature parity with {parse_table} but
    #   is not invoked by this method.
    #
    # @param [Hash] opts ({}) Configuration options.
    # @option opts [Nokogiri::Element] :html Container element to search into.
    # @option opts [String] :row_selector Vertical row like elements selector.
    # @option opts [String] :header_selector Header element selector, applied
    #   to each row.
    # @option opts [Hash{Symbol,String => Regex,String}] :header_key_label_map
    #   Header key vs. label dictionary.
    # @option opts [String] :content_selector Content element selector,
    #   applied to each row.
    # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
    #   Custom column parsers for advanced data extraction.
    #
    # @return [Hash{Symbol,String => Object},nil] Parsed data by key, `nil`
    #   when `:html` is nil or the rows cannot be selected.
    def self.parse_vertical_table(opts = {}, &filter)
      opts = {
        html: nil,
        row_selector: nil,
        header_selector: nil,
        header_key_label_map: {},
        content_selector: nil,
        column_parsers: {}
      }.merge opts
      return nil if opts[:html].nil?

      dictionary = opts[:header_key_label_map]
      column_parsers = opts[:column_parsers] || {}
      fallback_parser = method(:default_parser)

      # Best effort: a failing row selector means there is nothing to parse.
      rows = begin
        opts[:html].css(opts[:row_selector])
      rescue StandardError
        nil
      end
      return nil if rows.nil?

      rows.each_with_object({}) do |row, data|
        key = translate_label_to_key(row.css(opts[:header_selector]), dictionary)
        next if key.nil? || key == ''

        parser = column_parsers[key] || fallback_parser
        parser.call(row.css(opts[:content_selector]), data, key)
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module AeEasy
  module Text
    # Gem version. Frozen so the shared constant cannot be mutated in place.
    VERSION = '0.0.1'.freeze
  end
end
metadata ADDED
@@ -0,0 +1,186 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ae_easy-text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Eduardo Rosales
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-02-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ae_easy-core
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.16.3
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.16.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '5.11'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '5.11'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 0.16.1
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 0.16.1
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov-console
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.4.2
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 0.4.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: timecop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: 0.9.1
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: 0.9.1
111
+ - !ruby/object:Gem::Dependency
112
+ name: byebug
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: AnswersEngine Easy toolkit text module contains multiple text parsing
126
+ helpers.
127
+ email:
128
+ - eduardo@datahen.com
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".gitignore"
134
+ - ".travis.yml"
135
+ - ".yardopts"
136
+ - CODE_OF_CONDUCT.md
137
+ - Gemfile
138
+ - LICENSE
139
+ - README.md
140
+ - Rakefile
141
+ - ae_easy-text.gemspec
142
+ - doc/AeEasy.html
143
+ - doc/AeEasy/Text.html
144
+ - doc/_index.html
145
+ - doc/class_list.html
146
+ - doc/css/common.css
147
+ - doc/css/full_list.css
148
+ - doc/css/style.css
149
+ - doc/file.README.html
150
+ - doc/file_list.html
151
+ - doc/frames.html
152
+ - doc/index.html
153
+ - doc/js/app.js
154
+ - doc/js/full_list.js
155
+ - doc/js/jquery.js
156
+ - doc/method_list.html
157
+ - doc/top-level-namespace.html
158
+ - lib/ae_easy/text.rb
159
+ - lib/ae_easy/text/version.rb
160
+ homepage: https://answersengine.com
161
+ licenses:
162
+ - MIT
163
+ metadata:
164
+ homepage_uri: https://answersengine.com
165
+ source_code_uri: https://github.com/answersengine/ae_easy-text
166
+ post_install_message:
167
+ rdoc_options: []
168
+ require_paths:
169
+ - lib
170
+ required_ruby_version: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: 2.2.2
175
+ required_rubygems_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ requirements: []
181
+ rubyforge_project:
182
+ rubygems_version: 2.7.6
183
+ signing_key:
184
+ specification_version: 4
185
+ summary: AnswersEngine Easy toolkit text module
186
+ test_files: []