ae_easy-text 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/.yardopts +1 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE +21 -0
- data/README.md +16 -0
- data/Rakefile +22 -0
- data/ae_easy-text.gemspec +49 -0
- data/doc/AeEasy.html +117 -0
- data/doc/AeEasy/Text.html +2024 -0
- data/doc/_index.html +122 -0
- data/doc/class_list.html +51 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +58 -0
- data/doc/css/style.css +496 -0
- data/doc/file.README.html +91 -0
- data/doc/file_list.html +56 -0
- data/doc/frames.html +17 -0
- data/doc/index.html +91 -0
- data/doc/js/app.js +292 -0
- data/doc/js/full_list.js +216 -0
- data/doc/js/jquery.js +4 -0
- data/doc/method_list.html +131 -0
- data/doc/top-level-namespace.html +110 -0
- data/lib/ae_easy/text.rb +283 -0
- data/lib/ae_easy/text/version.rb +6 -0
- metadata +186 -0
data/lib/ae_easy/text.rb
ADDED
@@ -0,0 +1,283 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'json'
|
3
|
+
require 'digest/sha1'
|
4
|
+
require 'ae_easy-core'
|
5
|
+
require 'ae_easy/text/version'
|
6
|
+
|
7
|
+
module AeEasy
|
8
|
+
module Text
|
9
|
+
# Create a SHA1 hex digest that identifies an object.
#
# Hashes are serialized into a canonical, key-order independent string
# before being digested. `Hash#hash` is deliberately not used for this,
# because Ruby does not guarantee its value to be identical across
# invocations, so a digest derived from it could not be reproduced by
# another run.
#
# @param [String,Hash,Object] object Object to create hash from.
#
# @return [String] Hexadecimal SHA1 digest.
def self.hash object
  # Recursive serializer: hash pairs are sorted so insertion order does not
  # matter, and `inspect` keeps `:a` distinct from `'a'`.
  canonical = lambda do |value|
    case value
    when Hash
      pairs = value.map { |k, v| [canonical.call(k), canonical.call(v)] }
      body = pairs.sort.map { |k, v| "#{k}=>#{v}" }.join(',')
      "{#{body}}"
    when Array
      "[#{value.map { |item| canonical.call(item) }.join(',')}]"
    else
      value.inspect
    end
  end

  object = canonical.call(object) if object.is_a? Hash
  Digest::SHA1.hexdigest object.to_s
end
|
18
|
+
|
19
|
+
# Escape a text so it can be used as HTML content.
#
# Thin wrapper around `CGI.escapeHTML`.
#
# @param [String] text Text to encode.
#
# @return [String] Escaped text.
def self.encode_html text
  CGI.escapeHTML(text)
end
|
27
|
+
|
28
|
+
# Decode the HTML entities found in a text.
#
# Thin wrapper around `CGI.unescapeHTML`.
#
# @param [String] text Text to decode.
#
# @return [String] Decoded text.
def self.decode_html text
  CGI.unescapeHTML(text)
end
|
36
|
+
|
37
|
+
# Normalize and strip a value.
#
# Every run of whitespace (including ideographic and non breaking spaces)
# is collapsed into a single space, leading and trailing spaces are
# removed and HTML entities are decoded afterwards.
#
# When the text cannot be matched against a UTF-8 pattern it is converted
# to UTF-8 first, working on a copy so the caller's string is never
# modified:
# * when `$APP_CONFIG[:encoding]` is set, the bytes are read using that
#   encoding;
# * otherwise UTF-8/binary text gets its invalid bytes replaced and any
#   other encoding is transcoded with invalid or undefined characters
#   replaced.
#
# @param [String,Object,nil] raw_text Text to strip.
#
# @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
def self.strip raw_text
  return nil if raw_text.nil?

  text = raw_text.is_a?(String) ? raw_text : raw_text.to_s
  blanks = /(\s|\u3000|\u00a0)+/

  # Matching raises when the text has invalid bytes or an encoding that is
  # incompatible with the UTF-8 pattern.
  matchable = begin
    text =~ /\u3000/
    true
  rescue StandardError
    false
  end

  unless matchable
    # The global is optional; an unset `$APP_CONFIG` must not crash here.
    source_encoding = $APP_CONFIG.respond_to?(:[]) ? $APP_CONFIG[:encoding] : nil
    text =
      if !source_encoding.nil?
        text.dup.force_encoding(source_encoding).encode('UTF-8')
      elsif [Encoding::UTF_8, Encoding::BINARY].include?(text.encoding)
        text.dup.force_encoding('UTF-8').scrub
      else
        text.encode('UTF-8', invalid: :replace, undef: :replace)
      end
    # "\u00c2\u00a0" is what a UTF-8 non breaking space looks like once its
    # bytes were read as a single byte encoding.
    blanks = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
  end

  decode_html text.gsub(blanks, ' ').strip
end
|
54
|
+
|
55
|
+
# Default cell content parser used to parse cell element.
#
# Removes the `i` elements matched by the search, then stores the stripped
# cell text into +data+ under +key+. A nil cell stores `nil`.
#
# @param [Nokogiri::Element,nil] cell_element Cell element to parse.
# @param [Hash] data Data hash to save parsed data into.
# @param [String,Symbol] key Header column key being parsed.
#
# @return [String,nil] Value stored into +data+.
def self.default_parser cell_element, data, key
  # NOTE(review): '//i' is an absolute XPath, so this presumably matches
  # `i` elements across the whole document rather than only inside this
  # cell - confirm whether './/i' was intended.
  cell_element&.search('//i')&.remove
  data[key] = strip cell_element&.text
end
|
64
|
+
|
65
|
+
# Parse row data matching a selector using a header map to translate
# between columns and friendly keys.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Nokogiri::Element] :html Container element to search into.
# @option opts [String] :selector CSS selector to match content rows.
# @option opts [Boolean] :first_row_header (false) If true then first
#   matching element will be assumed to be header and ignored.
# @option opts [Hash{Symbol,String => Integer}] :header_map Header key vs
#   index dictionary. The index is looked up in the row's children.
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
#   Custom column parsers for advanced data extraction. Each one is called
#   with the cell element, the row data hash and the key.
#
# @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
# @yieldparam [Array] row Raw row data.
# @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
# @yieldreturn [Boolean] `true` when valid, else `false`.
#
# @return [Array<Hash>,nil] Parsed rows data, `nil` when `:html` is nil.
def self.parse_content opts = {}, &filter
  opts = {
    html: nil,
    selector: nil,
    first_row_header: false,
    header_map: {},
    column_parsers: {}
  }.merge opts
  return nil if opts[:html].nil?

  header_map = opts[:header_map]
  column_parsers = opts[:column_parsers]

  # Get rows, leaving the header row out when requested
  html_rows = opts[:html].css(opts[:selector])
  html_rows = html_rows.drop(1) if opts[:first_row_header]

  html_rows.each_with_object([]) do |row, data|
    # Extract content data with the default or a custom column parser
    row_data = {}
    header_map.each do |key, index|
      child_element = row.children[index]
      parser = column_parsers[key]
      if parser.nil?
        default_parser(child_element, row_data, key)
      else
        parser.call(child_element, row_data, key)
      end
    end

    # Keep the row unless a filter was given and rejected it
    data << row_data if filter.nil? || filter.call(row_data, row, header_map)
  end
end
|
123
|
+
|
124
|
+
# Extract column label and translate it into a friendly key.
#
# The stripped element text is compared against every entry of
# +label_map+: `Regexp` values are matched, any other value is compared
# with `==`. The key of the first matching entry is returned.
#
# @param [Nokogiri::Element,nil] element Html element to parse.
# @param [Hash{Symbol,String => Regex,String}] label_map Label dictionary
#   for translation into key.
#
# @return [Symbol,String,nil] Translated key, `nil` when nothing matches.
def self.translate_label_to_key element, label_map
  # NOTE(review): '//i' is an absolute XPath, so this presumably matches
  # `i` elements across the whole document rather than only inside this
  # element - confirm whether './/i' was intended.
  element&.search('//i')&.remove
  label = strip element&.text
  match = label_map.find do |_key, matcher|
    matcher.is_a?(Regexp) ? (label =~ matcher) : (label == matcher)
  end
  match&.first
end
|
139
|
+
|
140
|
+
# Parse header from selector and create a header map to match a column key
# with column index.
#
# Only the first element matching the selector is used to build the map;
# the index of a column is its position within that element's children.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Nokogiri::Element] :html Container element to search into.
# @option opts [String] :selector CSS selector to match header rows.
# @option opts [Hash{Symbol,String => Regex,String}] :column_key_label_map
#   Key vs. label dictionary.
# @option opts [Boolean] :first_row_header (false) Accepted for symmetry
#   with the content parsing options; the first matching row is the one
#   used as header either way.
#
# @return [Hash{Symbol,String => Integer},nil] Key vs. column index map,
#   `nil` when the rows cannot be retrieved or nothing matches.
def self.parse_header_map opts = {}
  opts = {
    html: nil,
    selector: nil,
    column_key_label_map: {},
    first_row_header: false
  }.merge opts

  # Best effort lookup: any failure retrieving the rows means "no header"
  header_rows = begin
    opts[:html].css(opts[:selector])
  rescue StandardError
    nil
  end
  return nil if header_rows.nil?

  # An empty match has no header row to read
  header_row = header_rows.first
  return nil if header_row.nil?

  # Parse and map column headers, ignoring unknown labels
  dictionary = opts[:column_key_label_map]
  header_row.children.each_with_index.each_with_object({}) do |(col, index), column_map|
    column_key = translate_label_to_key col, dictionary
    column_map[column_key] = index unless column_key.nil?
  end
end
|
181
|
+
|
182
|
+
# Parse data from a horizontal table like structure: a header map is built
# first with `parse_header_map` and then used by `parse_content` to read
# every content row.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Nokogiri::Element] :html Container element to search into.
# @option opts [String] :header_selector Header column elements selector.
# @option opts [Hash{Symbol,String => Regex,String}] :header_key_label_map
#   Header key vs. label dictionary to match column indexes.
# @option opts [String] :content_selector Content row elements selector.
# @option opts [Boolean] :first_row_header (false) Forwarded to both the
#   header and the content parsing steps.
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
#   Custom column parsers for advanced data extraction.
#
# @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
# @yieldparam [Array] row Raw content row data.
# @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
# @yieldreturn [Boolean] `true` when valid, else `false`.
#
# @return [Hash{Symbol => Array,Hash,nil},nil] `nil` when `:html` is nil or
#   no header map could be built, else a hash as follows:
#   * `[Hash] :header_map` Header map used.
#   * `[Array<Hash>,nil] :data` Parsed rows data.
def self.parse_table opts = {}, &filter
  defaults = {
    html: nil,
    header_selector: nil,
    header_key_label_map: {},
    content_selector: nil,
    first_row_header: false,
    column_parsers: {}
  }
  opts = defaults.merge opts

  html = opts[:html]
  return nil if html.nil?

  # Resolve which column index belongs to each key
  header_map = parse_header_map(
    html: html,
    selector: opts[:header_selector],
    column_key_label_map: opts[:header_key_label_map],
    first_row_header: opts[:first_row_header]
  )
  return nil if header_map.nil?

  # Read the content rows using the resolved columns
  rows = parse_content(
    html: html,
    selector: opts[:content_selector],
    header_map: header_map,
    first_row_header: opts[:first_row_header],
    column_parsers: opts[:column_parsers],
    &filter
  )

  {header_map: header_map, data: rows}
end
|
227
|
+
|
228
|
+
# Parse data from a vertical table like structure, where every matching
# row holds its own header and content elements.
#
# For each row the header element is translated into a key with
# `translate_label_to_key`; rows whose key is `nil` or `''` are skipped.
# The content element is then handed to the custom parser registered for
# that key or, when there is none, to `default_parser`.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Nokogiri::Element] :html Container element to search into.
# @option opts [String] :row_selector Vertical row like elements selector.
# @option opts [String] :header_selector Header element selector, applied
#   to each row.
# @option opts [Hash{Symbol,String => Regex,String}] :header_key_label_map
#   Header key vs. label dictionary.
# @option opts [String] :content_selector Content element selector, applied
#   to each row.
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
#   Custom column parsers for advanced data extraction. Each one is called
#   with the content element, the data hash and the key.
#
# @note The +filter+ block is accepted but is not used by this method.
#
# @return [Hash{Symbol,String => Object},nil] Hash filled in by the
#   parsers, `nil` when `:html` is nil or the rows cannot be retrieved.
def self.parse_vertical_table opts = {}, &filter
  defaults = {
    html: nil,
    row_selector: nil,
    header_selector: nil,
    header_key_label_map: {},
    content_selector: nil,
    column_parsers: {}
  }
  opts = defaults.merge opts

  html = opts[:html]
  return nil if html.nil?

  label_map = opts[:header_key_label_map]
  parsers = opts[:column_parsers]

  # Best effort lookup: any failure retrieving the rows means "no data"
  rows = begin
    html.css(opts[:row_selector])
  rescue StandardError
    nil
  end
  return nil if rows.nil?

  rows.each_with_object({}) do |row, data|
    # Parse and map the row header
    key = translate_label_to_key row.css(opts[:header_selector]), label_map
    next if key.nil? || key == ''

    # Parse the row content with the default or a custom parser
    content = row.css(opts[:content_selector])
    parser = parsers[key]
    if parser.nil?
      default_parser(content, data, key)
    else
      parser.call(content, data, key)
    end
  end
end
|
282
|
+
end
|
283
|
+
end
|
metadata
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ae_easy-text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Eduardo Rosales
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-02-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ae_easy-core
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.16.3
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.16.3
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5.11'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '5.11'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.16.1
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.16.1
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov-console
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.4.2
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.4.2
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: timecop
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 0.9.1
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.9.1
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: byebug
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description: AnswersEngine Easy toolkit text module contains multiple text parsing
|
126
|
+
helpers.
|
127
|
+
email:
|
128
|
+
- eduardo@datahen.com
|
129
|
+
executables: []
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files: []
|
132
|
+
files:
|
133
|
+
- ".gitignore"
|
134
|
+
- ".travis.yml"
|
135
|
+
- ".yardopts"
|
136
|
+
- CODE_OF_CONDUCT.md
|
137
|
+
- Gemfile
|
138
|
+
- LICENSE
|
139
|
+
- README.md
|
140
|
+
- Rakefile
|
141
|
+
- ae_easy-text.gemspec
|
142
|
+
- doc/AeEasy.html
|
143
|
+
- doc/AeEasy/Text.html
|
144
|
+
- doc/_index.html
|
145
|
+
- doc/class_list.html
|
146
|
+
- doc/css/common.css
|
147
|
+
- doc/css/full_list.css
|
148
|
+
- doc/css/style.css
|
149
|
+
- doc/file.README.html
|
150
|
+
- doc/file_list.html
|
151
|
+
- doc/frames.html
|
152
|
+
- doc/index.html
|
153
|
+
- doc/js/app.js
|
154
|
+
- doc/js/full_list.js
|
155
|
+
- doc/js/jquery.js
|
156
|
+
- doc/method_list.html
|
157
|
+
- doc/top-level-namespace.html
|
158
|
+
- lib/ae_easy/text.rb
|
159
|
+
- lib/ae_easy/text/version.rb
|
160
|
+
homepage: https://answersengine.com
|
161
|
+
licenses:
|
162
|
+
- MIT
|
163
|
+
metadata:
|
164
|
+
homepage_uri: https://answersengine.com
|
165
|
+
source_code_uri: https://github.com/answersengine/ae_easy-text
|
166
|
+
post_install_message:
|
167
|
+
rdoc_options: []
|
168
|
+
require_paths:
|
169
|
+
- lib
|
170
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
171
|
+
requirements:
|
172
|
+
- - ">="
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
version: 2.2.2
|
175
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - ">="
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0'
|
180
|
+
requirements: []
|
181
|
+
rubyforge_project:
|
182
|
+
rubygems_version: 2.7.6
|
183
|
+
signing_key:
|
184
|
+
specification_version: 4
|
185
|
+
summary: AnswersEngine Easy toolkit text module
|
186
|
+
test_files: []
|