ae_easy-text 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '043945fa52e59d4b712944fc7ada1ac2de17b7d4aca575a74a329e6804676f8c'
4
- data.tar.gz: 06fe3a253c11ad468646fb2ed23db2ff5fcaf1f9dd387f8e8f4a18961f519916
3
+ metadata.gz: ca70d3930261e598c1ec9ebbb7abfdfbe9f330b616ac1b2319fc3fcf51f6e80c
4
+ data.tar.gz: 16e7e08587840e7aedd9b37171ccbbc84d21c8f97a0fcfcd2a7cf2daee86a9a1
5
5
  SHA512:
6
- metadata.gz: 247d51ceb25d3805bbc440f7475d4254c4c28cb297d83e598ccb5594f7428a9a4e6e1fda389ef3668a596f2660c5cc55422e3c6e438c6aae6cf7e4d1076ac1d5
7
- data.tar.gz: 005c59ad17927b016948d6a7b62af827910df99d9e4c31c4ba79e40fcd1953caf54da36ef5e555dd13d9ced4cce0f376ceb0a6104b5e8caee2ccd21eb6f339fb
6
+ metadata.gz: 44bcb6b53f4bae2385afc655835771dc20f10d5da9f17158049793292e837a06981b787bf23ad604f2bdfda7d7bdcc1128a2d31f38b4da0ee169e1508b0c555a
7
+ data.tar.gz: dd27481084e95be832506ec25235b674e2d2b13f47176b3b3a9f05fa9fa3114b14b88b7e7cfa76a0b1a5d30114f6912815bb23174701f3a75ebdb469cd193d76
@@ -55,7 +55,7 @@ further defined and clarified by project maintainers.
55
55
  ## Enforcement
56
56
 
57
57
  Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
- reported by contacting the project team at parama@answersengine.com. All
58
+ reported by contacting the project team at perry@datahen.com. All
59
59
  complaints will be reviewed and investigated and will result in a response that
60
60
  is deemed necessary and appropriate to the circumstances. The project team is
61
61
  obligated to maintain confidentiality with regard to the reporter of an incident.
data/Gemfile CHANGED
@@ -2,5 +2,5 @@ source "https://rubygems.org"
2
2
 
3
3
  git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
- # Specify your gem's dependencies in answersengine.gemspec
5
+ # Specify your gem's dependencies in ae_easy-text.gemspec
6
6
  gemspec
data/README.md CHANGED
@@ -5,12 +5,16 @@
5
5
  # AeEasy text module
6
6
  ## Description
7
7
 
8
- AeEasy text is part of the AeEasy gem collection. It provides multiple text parsing helpers to ease common text parsing use cases.
8
+ AeEasy text is part of the AeEasy gem collection and an alias to the [dh_easy-text](https://github.com/DataHenOfficial/dh_easy-text/) gem. It provides multiple text parsing helpers to ease common text parsing use cases.
9
9
 
10
10
  Install gem:
11
- ```gem install 'ae_easy-text'```
11
+ ```ruby
12
+ gem install 'ae_easy-text'
13
+ ```
12
14
 
13
15
  Require gem:
14
- ```require 'ae_easy/text'```
16
+ ```ruby
17
+ require 'ae_easy/text'
18
+ ```
15
19
 
16
- Documentation can be found [here](http://rubydoc.org/gems/ae_easy-text/frames).
20
+ See the [dh_easy-text](https://github.com/DataHenOfficial/dh_easy-text/) gem for documentation.
data/Rakefile CHANGED
@@ -9,14 +9,4 @@ Rake::TestTask.new do |t|
9
9
  t.test_files = FileList['./test/**/*_test.rb']
10
10
  end
11
11
 
12
- desc 'Benchmark another task execution | usage example: benchmark[my_task, param1, param2]'
13
- task :benchmark, [:task] do |task, args|
14
- task_name = args[:task]
15
- if task_name.nil?
16
- puts "Should select a task."
17
- exit 1
18
- end
19
- puts Benchmark.measure{ Rake::Task[task_name].invoke *args.extras }
20
- end
21
-
22
12
  task default: :test
@@ -1,17 +1,16 @@
1
1
 
2
2
  lib = File.expand_path("../lib", __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require "ae_easy/text/version"
5
4
 
6
5
  Gem::Specification.new do |spec|
7
6
  spec.name = "ae_easy-text"
8
- spec.version = AeEasy::Text::VERSION
7
+ spec.version = "0.0.5"
9
8
  spec.authors = ["Eduardo Rosales"]
10
9
  spec.email = ["eduardo@datahen.com"]
11
10
 
12
- spec.summary = %q{AnswersEngine Easy toolkit text module}
13
- spec.description = %q{AnswersEngine Easy toolkit text module contains multiple text parsing helpers.}
14
- spec.homepage = "https://answersengine.com"
11
+ spec.summary = %q{(Deprecated: Use datahen gem instead.) Compatibility alias for Datahen Easy toolkit text module}
12
+ spec.description = %q{(Deprecated: Use datahen gem instead.) Compatibility alias for Datahen Easy toolkit text module contains multiple text parsing helpers.}
13
+ spec.homepage = "https://datahen.com"
15
14
  spec.license = "MIT"
16
15
 
17
16
  # spec.cert_chain = ['certs/ae_easy.pem']
@@ -38,12 +37,10 @@ Gem::Specification.new do |spec|
38
37
  spec.require_paths = ["lib"]
39
38
  spec.required_ruby_version = '>= 2.2.2'
40
39
 
41
- spec.add_dependency 'ae_easy-core', '~> 0'
40
+ spec.add_dependency 'dh_easy-text', '~> 0'
41
+ spec.add_dependency 'ae_easy-core', '>= 0.2.1'
42
42
  spec.add_development_dependency 'bundler', '>= 1'
43
43
  spec.add_development_dependency 'rake', '~> 10'
44
44
  spec.add_development_dependency 'minitest', '~> 5'
45
- spec.add_development_dependency 'simplecov', '~> 0'
46
- spec.add_development_dependency 'simplecov-console', '~> 0'
47
- spec.add_development_dependency 'timecop', '~> 0'
48
45
  spec.add_development_dependency 'byebug', '>= 0'
49
46
  end
@@ -1,311 +1,5 @@
1
- require 'cgi'
2
- require 'json'
3
- require 'digest/sha1'
1
+ require 'dh_easy/text'
4
2
  require 'ae_easy/core'
5
- require 'ae_easy/text/version'
6
3
 
7
- module AeEasy
8
- module Text
9
- # Create a hash from object
10
- #
11
- # @param [String,Hash,Object] object Object to create hash from.
12
- #
13
- # @return [String]
14
- def self.hash object
15
- object = object.hash if object.is_a? Hash
16
- Digest::SHA1.hexdigest object.to_s
17
- end
18
-
19
- # Encode text for valid HTML entities.
20
- #
21
- # @param [String] text Text to encode.
22
- #
23
- # @return [String]
24
- def self.encode_html text
25
- CGI.escapeHTML text
26
- end
27
-
28
- # Decode HTML entities from text.
29
- #
30
- # @param [String] text Text to decode.
31
- #
32
- # @return [String]
33
- def self.decode_html text
34
- CGI.unescapeHTML text
35
- end
36
-
37
- # Strip a value by trimming spaces, reducing sequential spaces into a
38
- # single space, decoding HTML entities and changing encoding to UTF-8.
39
- #
40
- # @param [String,Object,nil] raw_text Text to strip.
41
- # @param [String] orig_encoding Text original encoding.
42
- #
43
- # @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
44
- def self.strip raw_text, orig_encoding = 'ASCII'
45
- return nil if raw_text.nil?
46
- raw_text = raw_text.to_s unless raw_text.is_a? String
47
- regex = /(\s|\u3000|\u00a0)+/
48
- good_encoding = (raw_text =~ /\u3000/ || true) rescue false
49
- unless good_encoding
50
- raw_text = raw_text.force_encoding(orig_encoding).encode('UTF-8', invalid: :replace, undef: :replace)
51
- regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
52
- end
53
- text = raw_text.gsub(regex, ' ').strip
54
- text.nil? ? nil : decode_html(text)
55
- end
56
-
57
- # Default cell content parser used to parse cell element.
58
- #
59
- # @param [Nokogiri::Element] cell_element Cell element to parse.
60
- # @param [Hash] data Data hash to save parsed data into.
61
- # @param [String,Symbol] key Header column key being parsed.
62
- def self.default_parser cell_element, data, key
63
- return if cell_element.nil?
64
- cell_element.search('//i').remove if cell_element.search('//i').count > 0
65
- data[key] = strip cell_element.text
66
- end
67
-
68
- # Parse row data matching a selector using a header map to translate
69
- # between columns and friendly keys.
70
- #
71
- # @param [Hash] opts ({}) Configuration options.
72
- # @option opts [Nokogiri::Element] :html Container element to search into.
73
- # @option opts [String] :selector CSS selector to match content cells.
74
- # @option opts [Boolean] :first_row_header (false) If true then first
75
- # matching element will be assumed to be header and ignored.
76
- # @option opts [Hash{Symbol,String => Integer}] :header_map Header key vs
77
- # index dictionary.
78
- # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
79
- # Custom column parsers for advanced data extraction.
80
- # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
81
- # retrieving content cells and rows.
82
- #
83
- # @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
84
- # @yieldparam [Array] row Raw row data.
85
- # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
86
- # @yieldreturn [Boolean] `true` when valid, else `false`.
87
- #
88
- # @return [Array<Hash>,nil] Parsed rows data.
89
- def self.parse_content opts, &filter
90
- opts = {
91
- html: nil,
92
- selector: nil,
93
- first_row_header: false,
94
- header_map: {},
95
- column_parsers: {},
96
- ignore_text_nodes: true
97
- }.merge opts
98
-
99
- # Setup config
100
- data = []
101
- row_data = child_element = nil
102
- first = first_row_header = opts[:first_row_header]
103
- header_map = opts[:header_map]
104
- column_parsers = opts[:column_parsers]
105
- ignore_text_nodes = opts[:ignore_text_nodes]
106
-
107
- # Get and parse rows
108
- html_rows = opts[:html].css(opts[:selector])
109
- html_rows.each do |row|
110
- next if ignore_text_nodes && row.name == 'text'
111
-
112
- # First row header validation
113
- if first && first_row_header
114
- first = false
115
- next
116
- end
117
-
118
- # Extract content data
119
- row_data = {}
120
- header_map.each do |key, index|
121
- # Parse column html with default or custom parser
122
- children = row.children
123
- children = children.select{|i|i.name != 'text'} if ignore_text_nodes
124
- child_element = children[index]
125
- column_parsers[key].nil? ?
126
- default_parser(child_element, row_data, key) :
127
- column_parsers[key].call(child_element, row_data, key)
128
- end
129
- next unless filter.nil? || filter.call(row_data, row, header_map)
130
- data << row_data
131
- end
132
- data
133
- end
134
-
135
- # Extract column label and translate it into a friendly key.
136
- #
137
- # @param [Nokogiri::Element] element Html element to parse.
138
- # @param [Hash{Symbol,String => Regex,String}] label_map Label dictionary
139
- # for translation into key.
140
- #
141
- # @return [Symbol,String] Translated key.
142
- def self.translate_label_to_key element, label_map
143
- return nil if element.nil?
144
- element.search('//i').remove if element.search('//i').count > 0
145
- text = strip element.text
146
- key_pair = label_map.find do |k,v|
147
- v.is_a?(Regexp) ? (text =~ v) : (text == v)
148
- end
149
- key = key_pair.nil? ? nil : key_pair[0]
150
- end
151
-
152
- # Parse header from selector and create a header map to match a column key
153
- # with column index.
154
- #
155
- # @param [Hash] opts ({}) Configuration options.
156
- # @option opts [Nokogiri::Element] :html Container element to search into.
157
- # @option opts [String] :selector CSS selector to match header cells.
158
- # @option opts [Hash{Symbol,String => Regex,String}] :column_key_label_map
159
- # Key vs. label dictionary.
160
- # @option opts [Boolean] :first_row_header (false) If true then selector
161
- # first matching row will be used as header for parsing.
162
- # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
163
- # retrieving header cells and rows.
164
- #
165
- # @return [Hash{Symbol,String => Integer},nil] Key vs. column index map.
166
- def self.parse_header_map opts = {}
167
- opts = {
168
- html: nil,
169
- selector: nil,
170
- column_key_label_map: {},
171
- first_row_header: false,
172
- ignore_text_nodes: true
173
- }.merge opts
174
-
175
- # Setup config
176
- dictionary = opts[:column_key_label_map]
177
- ignore_text_nodes = opts[:ignore_text_nodes]
178
- data = []
179
- column_map = nil
180
-
181
- # Extract and parse header rows
182
- html_rows = opts[:html].css(opts[:selector]) rescue nil
183
- return nil if html_rows.nil?
184
- html_rows = [html_rows.first] if opts[:first_row_header]
185
- html_rows.each do |row|
186
- next if ignore_text_nodes && row.name == 'text'
187
-
188
- column_map = {}
189
- children = row.children
190
- children = children.select{|i|i.name != 'text'} if ignore_text_nodes
191
- children.each_with_index do |col, index|
192
- # Parse and map column header
193
- column_key = translate_label_to_key col, dictionary
194
- next if column_key.nil?
195
- column_map[column_key] = index
196
- end
197
- data << column_map
198
- end
199
- data&.first
200
- end
201
-
202
- # Parse data from a horizontal table-like structure matching selectors and
203
- # using a header map to match columns.
204
- #
205
- # @param [Hash] opts ({}) Configuration options.
206
- # @option opts [Nokogiri::Element] :html Container element to search into.
207
- # @option opts [String] :header_selector Header column elements selector.
208
- # @option opts [Hash{Symbol,String => Regex,String}] :header_key_label_map
209
- # Header key vs. label dictionary to match column indexes.
210
- # @option opts [String] :content_selector Content row elements selector.
211
- # @option opts [Boolean] :first_row_header (false) If true then selector
212
- # first matching row will be used as header for parsing.
213
- # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
214
- # Custom column parsers for advanced data extraction.
215
- # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
216
- # retrieving cells and rows.
217
- #
218
- # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
219
- # @yieldparam [Array] row Raw content row data.
220
- # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
221
- # @yieldreturn [Boolean] `true` when valid, else `false`.
222
- #
223
- # @return [Hash{Symbol => Array,Hash,nil}] Hash data is as follows:
224
- # * `[Hash] :header_map` Header map used.
225
- # * `[Array<Hash>,nil] :data` Parsed rows data.
226
- def self.parse_table opts = {}, &filter
227
- opts = {
228
- html: nil,
229
- header_selector: nil,
230
- header_key_label_map: {},
231
- content_selector: nil,
232
- first_row_header: false,
233
- column_parsers: {},
234
- ignore_text_nodes: true
235
- }.merge opts
236
- return nil if opts[:html].nil?
237
- header_map = self.parse_header_map html: opts[:html],
238
- selector: opts[:header_selector],
239
- column_key_label_map: opts[:header_key_label_map],
240
- first_row_header: opts[:first_row_header],
241
- ignore_text_nodes: opts[:ignore_text_nodes]
242
- return nil if header_map.nil?
243
- data = self.parse_content html: opts[:html],
244
- selector: opts[:content_selector],
245
- header_map: header_map,
246
- first_row_header: opts[:first_row_header],
247
- column_parsers: opts[:column_parsers],
248
- ignore_text_nodes: opts[:ignore_text_nodes],
249
- &filter
250
- {header_map: header_map, data: data}
251
- end
252
-
253
- # Parse data from a vertical table-like structure matching selectors and
254
- # using a header map to match columns.
255
- #
256
- # @param [Hash] opts ({}) Configuration options.
257
- # @option opts [Nokogiri::Element] :html Container element to search into.
258
- # @option opts [String] :row_selector Vertical row like elements selector.
259
- # @option opts [String] :header_selector Header column elements selector.
260
- # @option opts [Hash{Symbol,String => Regex,String}] :header_key_label_map
261
- # Header key vs. label dictionary to match column indexes.
262
- # @option opts [String] :content_selector Content row elements selector.
263
- # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
264
- # Custom column parsers for advance data extraction.
265
- # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
266
- # retrieving cells and rows.
267
- #
268
- # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
269
- # @yieldparam [Array] row Raw content row data.
270
- # @yieldparam [Hash{Symbol,String => Integer}] header_map Header map used.
271
- # @yieldreturn [Boolean] `true` when valid, else `false`.
272
- #
273
- # @return [Hash{Symbol => Array,Hash,nil}] Hash data is as follows:
274
- # * `[Hash] :header_map` Header map used.
275
- # * `[Array<Hash>,nil] :data` Parsed rows data.
276
- def self.parse_vertical_table opts = {}, &filter
277
- opts = {
278
- html: nil,
279
- row_selector: nil,
280
- header_selector: nil,
281
- header_key_label_map: {},
282
- content_selector: nil,
283
- column_parsers: {},
284
- ignore_text_nodes: true
285
- }.merge opts
286
- return nil if opts[:html].nil?
287
-
288
- # Setup config
289
- data = {}
290
- dictionary = opts[:header_key_label_map]
291
- column_parsers = opts[:column_parsers]
292
-
293
- # Extract headers and content
294
- html_rows = opts[:html].css(opts[:row_selector]) rescue nil
295
- return nil if html_rows.nil?
296
- html_rows.each do |row|
297
- # Parse and map column header
298
- header_element = row.css(opts[:header_selector])
299
- key = translate_label_to_key header_element, dictionary
300
- next if key.nil? || key == ''
301
-
302
- # Parse column html with default or custom parser
303
- content_element = row.css(opts[:content_selector])
304
- column_parsers[key].nil? ?
305
- default_parser(content_element, data, key) :
306
- column_parsers[key].call(content_element, data, key)
307
- end
308
- data
309
- end
310
- end
311
- end
4
+ # (Deprecated) Alias to DhEasy module.
5
+ AeEasy = ::DhEasy unless defined? ::AeEasy
metadata CHANGED
@@ -1,17 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ae_easy-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo Rosales
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-03-11 00:00:00.000000000 Z
11
+ date: 2019-12-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: ae_easy-core
14
+ name: dh_easy-text
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ae_easy-core
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.2.1
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,48 +80,6 @@ dependencies:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
82
  version: '5'
69
- - !ruby/object:Gem::Dependency
70
- name: simplecov
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: simplecov-console
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - "~>"
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - "~>"
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- - !ruby/object:Gem::Dependency
98
- name: timecop
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - "~>"
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - "~>"
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
83
  - !ruby/object:Gem::Dependency
112
84
  name: byebug
113
85
  requirement: !ruby/object:Gem::Requirement
@@ -122,8 +94,8 @@ dependencies:
122
94
  - - ">="
123
95
  - !ruby/object:Gem::Version
124
96
  version: '0'
125
- description: AnswersEngine Easy toolkit text module contains multiple text parsing
126
- helpers.
97
+ description: "(Deprecated: Use datahen gem instead.) Compatibility alias for Datahen
98
+ Easy toolkit text module contains multiple text parsing helpers."
127
99
  email:
128
100
  - eduardo@datahen.com
129
101
  executables: []
@@ -139,29 +111,12 @@ files:
139
111
  - README.md
140
112
  - Rakefile
141
113
  - ae_easy-text.gemspec
142
- - doc/AeEasy.html
143
- - doc/AeEasy/Text.html
144
- - doc/_index.html
145
- - doc/class_list.html
146
- - doc/css/common.css
147
- - doc/css/full_list.css
148
- - doc/css/style.css
149
- - doc/file.README.html
150
- - doc/file_list.html
151
- - doc/frames.html
152
- - doc/index.html
153
- - doc/js/app.js
154
- - doc/js/full_list.js
155
- - doc/js/jquery.js
156
- - doc/method_list.html
157
- - doc/top-level-namespace.html
158
114
  - lib/ae_easy/text.rb
159
- - lib/ae_easy/text/version.rb
160
- homepage: https://answersengine.com
115
+ homepage: https://datahen.com
161
116
  licenses:
162
117
  - MIT
163
118
  metadata:
164
- homepage_uri: https://answersengine.com
119
+ homepage_uri: https://datahen.com
165
120
  source_code_uri: https://github.com/answersengine/ae_easy-text
166
121
  post_install_message:
167
122
  rdoc_options: []
@@ -182,5 +137,6 @@ rubyforge_project:
182
137
  rubygems_version: 2.7.6
183
138
  signing_key:
184
139
  specification_version: 4
185
- summary: AnswersEngine Easy toolkit text module
140
+ summary: "(Deprecated: Use datahen gem instead.) Compatibility alias for Datahen Easy
141
+ toolkit text module"
186
142
  test_files: []