ae_easy-text 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 239189344e783f67b085da7394e535aa693a4b067c62b8d0b16f733a0b19d4f7
4
- data.tar.gz: ca144105f26e399116b05560ff870f6aa051a04696602f6f68f67f06b9e0bfda
3
+ metadata.gz: 10296214e4de01abc2d77f5a5549c9e4c883009a86915908283bb71dad3bec0b
4
+ data.tar.gz: 9db77d0892191a3dd5c170ffe2257c14559f0721075c88c1fc75ce6fa0f0b04e
5
5
  SHA512:
6
- metadata.gz: 0b7c4495eeb71e5dae3ad799d14f8a2d83989a949183ee3df2837191b4a4f3a10965ead38416ccda078da7b29fc083eb02fd53a24999f97473b02f77489d921c
7
- data.tar.gz: 4f377b26bcfb0ef4cce7806d153fb97de115d0e0bc4beef5e43b55b0125e1936d6c4440d1131cbe9cb4d5fe28a1810c2820269f6c317511068c91aff42ad8126
6
+ metadata.gz: 6582399f34051ebcc5fa5192c22fc984329a04b1ffefb8c2db49ac5e782978b49c3fc4ea5c7fe6c0f0bbdffb4fff1799adb03f79ca92358b13409691e10b6b0d
7
+ data.tar.gz: 174c5533dd32772393fb7a8ba632fb4de8fe5ce8eba6e35d753abc906b295c1659d6cde0cf18e7496b6469a430e7f922335e23f7c3d5d724eba019163a6801f3
data/ae_easy-text.gemspec CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
38
38
  spec.require_paths = ["lib"]
39
39
  spec.required_ruby_version = '>= 2.2.2'
40
40
 
41
- spec.add_dependency 'ae_easy-core', '>= 0'
41
+ spec.add_dependency 'ae_easy-core', '>= 0.1.2'
42
42
  spec.add_development_dependency 'bundler', '>= 1.16.3'
43
43
  spec.add_development_dependency 'rake', '>= 10.0'
44
44
  spec.add_development_dependency 'minitest', '>= 5.11'
data/doc/AeEasy.html CHANGED
@@ -107,7 +107,7 @@
107
107
  </div>
108
108
 
109
109
  <div id="footer">
110
- Generated on Tue Feb 26 16:50:02 2019 by
110
+ Generated on Fri Mar 8 17:26:54 2019 by
111
111
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
112
112
  0.9.18 (ruby-2.5.3).
113
113
  </div>
data/doc/AeEasy/Text.html CHANGED
@@ -108,7 +108,7 @@
108
108
 
109
109
  </div>
110
110
  </dt>
111
- <dd><pre class="code"><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>0.0.1</span><span class='tstring_end'>&quot;</span></span></pre></dd>
111
+ <dd><pre class="code"><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>0.0.2</span><span class='tstring_end'>&quot;</span></span></pre></dd>
112
112
 
113
113
  </dl>
114
114
 
@@ -326,7 +326,7 @@ using a header map to match columns.</p>
326
326
  <li class="public ">
327
327
  <span class="summary_signature">
328
328
 
329
- <a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text) &#x21d2; String<sup>?</sup> </a>
329
+ <a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text, orig_encoding = &#39;ASCII&#39;) &#x21d2; String<sup>?</sup> </a>
330
330
 
331
331
 
332
332
 
@@ -341,7 +341,8 @@ using a header map to match columns.</p>
341
341
 
342
342
 
343
343
  <span class="summary_desc"><div class='inline'>
344
- <p>Strip a value.</p>
344
+ <p>Strip a value by trimming spaces, reducing sequential spaces into a
345
+ single space, decoding HTML entities and changing encoding to UTF-8.</p>
345
346
  </div></span>
346
347
 
347
348
  </li>
@@ -532,17 +533,19 @@ using a header map to match columns.</p>
532
533
  <pre class="lines">
533
534
 
534
535
 
535
- 60
536
- 61
537
536
  62
538
- 63</pre>
537
+ 63
538
+ 64
539
+ 65
540
+ 66</pre>
539
541
  </td>
540
542
  <td>
541
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 60</span>
543
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 62</span>
542
544
 
543
545
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_default_parser'>default_parser</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='comma'>,</span> <span class='id identifier rubyid_data'>data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span>
544
- <span class='id identifier rubyid_cell_element'>cell_element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span>
545
- <span class='id identifier rubyid_row_data'>row_data</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_text'>text</span>
546
+ <span class='kw'>return</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
547
+ <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>&gt;</span> <span class='int'>0</span>
548
+ <span class='id identifier rubyid_data'>data</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
546
549
  <span class='kw'>end</span></pre>
547
550
  </td>
548
551
  </tr>
@@ -815,6 +818,22 @@ ignored.</p>
815
818
 
816
819
  &mdash; <div class='inline'>
817
820
  <p>Custom column parsers for advanced data extraction.</p>
821
+ </div>
822
+
823
+ </li>
824
+
825
+ <li>
826
+ <span class="name">:ignore_text_nodes</span>
827
+ <span class="type">(<tt>Boolean</tt>)</span>
828
+ <span class="default">
829
+
830
+ &mdash; default:
831
+ <tt>true</tt>
832
+
833
+ </span>
834
+
835
+ &mdash; <div class='inline'>
836
+ <p>Ignore text nodes when retrieving content cells and rows.</p>
818
837
  </div>
819
838
 
820
839
  </li>
@@ -917,11 +936,6 @@ ignored.</p>
917
936
  <pre class="lines">
918
937
 
919
938
 
920
- 84
921
- 85
922
- 86
923
- 87
924
- 88
925
939
  89
926
940
  90
927
941
  91
@@ -955,10 +969,21 @@ ignored.</p>
955
969
  119
956
970
  120
957
971
  121
958
- 122</pre>
972
+ 122
973
+ 123
974
+ 124
975
+ 125
976
+ 126
977
+ 127
978
+ 128
979
+ 129
980
+ 130
981
+ 131
982
+ 132
983
+ 133</pre>
959
984
  </td>
960
985
  <td>
961
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 84</span>
986
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 89</span>
962
987
 
963
988
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='id identifier rubyid_opts'>opts</span><span class='comma'>,</span> <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
964
989
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
@@ -966,7 +991,8 @@ ignored.</p>
966
991
  <span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
967
992
  <span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
968
993
  <span class='label'>header_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
969
- <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
994
+ <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
995
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
970
996
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
971
997
 
972
998
  <span class='comment'># Setup config
@@ -975,10 +1001,13 @@ ignored.</p>
975
1001
  <span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
976
1002
  <span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_map</span><span class='rbracket'>]</span>
977
1003
  <span class='id identifier rubyid_column_parsers'>column_parsers</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span>
1004
+ <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
978
1005
 
979
1006
  <span class='comment'># Get and parse rows
980
1007
  </span> <span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_css'>css</span><span class='lparen'>(</span><span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:selector</span><span class='rbracket'>]</span><span class='rparen'>)</span>
981
1008
  <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
1009
+ <span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span>
1010
+
982
1011
  <span class='comment'># First row header validation
983
1012
  </span> <span class='kw'>if</span> <span class='id identifier rubyid_first'>first</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span>
984
1013
  <span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='kw'>false</span>
@@ -989,7 +1018,9 @@ ignored.</p>
989
1018
  </span> <span class='id identifier rubyid_row_data'>row_data</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
990
1019
  <span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_key'>key</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
991
1020
  <span class='comment'># Parse column html with default or custom parser
992
- </span> <span class='id identifier rubyid_child_element'>child_element</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span><span class='lbracket'>[</span><span class='id identifier rubyid_index'>index</span><span class='rbracket'>]</span>
1021
+ </span> <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
1022
+ <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
1023
+ <span class='id identifier rubyid_child_element'>child_element</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='lbracket'>[</span><span class='id identifier rubyid_index'>index</span><span class='rbracket'>]</span>
993
1024
  <span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span>
994
1025
  <span class='id identifier rubyid_default_parser'>default_parser</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span> <span class='op'>:</span>
995
1026
  <span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_call'>call</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span>
@@ -1106,6 +1137,22 @@ ignored.</p>
1106
1137
  &mdash; <div class='inline'>
1107
1138
  <p>If true then selector first matching row will be used as header for
1108
1139
  parsing.</p>
1140
+ </div>
1141
+
1142
+ </li>
1143
+
1144
+ <li>
1145
+ <span class="name">:ignore_text_nodes</span>
1146
+ <span class="type">(<tt>Boolean</tt>)</span>
1147
+ <span class="default">
1148
+
1149
+ &mdash; default:
1150
+ <tt>true</tt>
1151
+
1152
+ </span>
1153
+
1154
+ &mdash; <div class='inline'>
1155
+ <p>Ignore text nodes when retrieving header cells and rows.</p>
1109
1156
  </div>
1110
1157
 
1111
1158
  </li>
@@ -1138,20 +1185,6 @@ parsing.</p>
1138
1185
  <pre class="lines">
1139
1186
 
1140
1187
 
1141
- 152
1142
- 153
1143
- 154
1144
- 155
1145
- 156
1146
- 157
1147
- 158
1148
- 159
1149
- 160
1150
- 161
1151
- 162
1152
- 163
1153
- 164
1154
- 165
1155
1188
  166
1156
1189
  167
1157
1190
  168
@@ -1166,21 +1199,43 @@ parsing.</p>
1166
1199
  177
1167
1200
  178
1168
1201
  179
1169
- 180</pre>
1202
+ 180
1203
+ 181
1204
+ 182
1205
+ 183
1206
+ 184
1207
+ 185
1208
+ 186
1209
+ 187
1210
+ 188
1211
+ 189
1212
+ 190
1213
+ 191
1214
+ 192
1215
+ 193
1216
+ 194
1217
+ 195
1218
+ 196
1219
+ 197
1220
+ 198
1221
+ 199
1222
+ 200</pre>
1170
1223
  </td>
1171
1224
  <td>
1172
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 152</span>
1225
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 166</span>
1173
1226
 
1174
1227
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1175
1228
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
1176
1229
  <span class='label'>html:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1177
1230
  <span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1178
1231
  <span class='label'>column_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1179
- <span class='label'>first_row_header:</span> <span class='kw'>false</span>
1232
+ <span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
1233
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
1180
1234
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
1181
1235
 
1182
1236
  <span class='comment'># Setup config
1183
1237
  </span> <span class='id identifier rubyid_dictionary'>dictionary</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_key_label_map</span><span class='rbracket'>]</span>
1238
+ <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
1184
1239
  <span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
1185
1240
  <span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='kw'>nil</span>
1186
1241
 
@@ -1189,8 +1244,12 @@ parsing.</p>
1189
1244
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1190
1245
  <span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_first'>first</span><span class='rbracket'>]</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
1191
1246
  <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
1247
+ <span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span>
1248
+
1192
1249
  <span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1193
- <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_each_with_index'>each_with_index</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
1250
+ <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
1251
+ <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
1252
+ <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_each_with_index'>each_with_index</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
1194
1253
  <span class='comment'># Parse and map column header
1195
1254
  </span> <span class='id identifier rubyid_column_key'>column_key</span> <span class='op'>=</span> <span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_dictionary'>dictionary</span>
1196
1255
  <span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_column_key'>column_key</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
@@ -1336,6 +1395,22 @@ parsing.</p>
1336
1395
 
1337
1396
  &mdash; <div class='inline'>
1338
1397
  <p>Custom column parsers for advanced data extraction.</p>
1398
+ </div>
1399
+
1400
+ </li>
1401
+
1402
+ <li>
1403
+ <span class="name">:ignore_text_nodes</span>
1404
+ <span class="type">(<tt>Boolean</tt>)</span>
1405
+ <span class="default">
1406
+
1407
+ &mdash; default:
1408
+ <tt>true</tt>
1409
+
1410
+ </span>
1411
+
1412
+ &mdash; <div class='inline'>
1413
+ <p>Ignore text nodes when retrieving cells and rows.</p>
1339
1414
  </div>
1340
1415
 
1341
1416
  </li>
@@ -1443,32 +1518,35 @@ parsing.</p>
1443
1518
  <pre class="lines">
1444
1519
 
1445
1520
 
1446
- 204
1447
- 205
1448
- 206
1449
- 207
1450
- 208
1451
- 209
1452
- 210
1453
- 211
1454
- 212
1455
- 213
1456
- 214
1457
- 215
1458
- 216
1459
- 217
1460
- 218
1461
- 219
1462
- 220
1463
- 221
1464
- 222
1465
- 223
1466
- 224
1467
- 225
1468
- 226</pre>
1521
+ 226
1522
+ 227
1523
+ 228
1524
+ 229
1525
+ 230
1526
+ 231
1527
+ 232
1528
+ 233
1529
+ 234
1530
+ 235
1531
+ 236
1532
+ 237
1533
+ 238
1534
+ 239
1535
+ 240
1536
+ 241
1537
+ 242
1538
+ 243
1539
+ 244
1540
+ 245
1541
+ 246
1542
+ 247
1543
+ 248
1544
+ 249
1545
+ 250
1546
+ 251</pre>
1469
1547
  </td>
1470
1548
  <td>
1471
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 204</span>
1549
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 226</span>
1472
1550
 
1473
1551
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_table'>parse_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
1474
1552
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
@@ -1477,19 +1555,22 @@ parsing.</p>
1477
1555
  <span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1478
1556
  <span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1479
1557
  <span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
1480
- <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1558
+ <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1559
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
1481
1560
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
1482
1561
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1483
1562
  <span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
1484
1563
  <span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
1485
1564
  <span class='label'>column_key_label_map:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_key_label_map</span><span class='rbracket'>]</span><span class='comma'>,</span>
1486
- <span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
1565
+ <span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
1566
+ <span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
1487
1567
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1488
1568
  <span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
1489
1569
  <span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:content_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
1490
1570
  <span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span>
1491
1571
  <span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
1492
1572
  <span class='label'>column_parsers:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span><span class='comma'>,</span>
1573
+ <span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span><span class='comma'>,</span>
1493
1574
  <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
1494
1575
  <span class='lbrace'>{</span><span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span> <span class='label'>data:</span> <span class='id identifier rubyid_data'>data</span><span class='rbrace'>}</span>
1495
1576
  <span class='kw'>end</span></pre>
@@ -1625,6 +1706,22 @@ parsing.</p>
1625
1706
 
1626
1707
  &mdash; <div class='inline'>
1627
1708
  <p>Custom column parsers for advanced data extraction.</p>
1709
+ </div>
1710
+
1711
+ </li>
1712
+
1713
+ <li>
1714
+ <span class="name">:ignore_text_nodes</span>
1715
+ <span class="type">(<tt>Boolean</tt>)</span>
1716
+ <span class="default">
1717
+
1718
+ &mdash; default:
1719
+ <tt>true</tt>
1720
+
1721
+ </span>
1722
+
1723
+ &mdash; <div class='inline'>
1724
+ <p>Ignore text nodes when retrieving cells and rows.</p>
1628
1725
  </div>
1629
1726
 
1630
1727
  </li>
@@ -1732,42 +1829,43 @@ parsing.</p>
1732
1829
  <pre class="lines">
1733
1830
 
1734
1831
 
1735
- 249
1736
- 250
1737
- 251
1738
- 252
1739
- 253
1740
- 254
1741
- 255
1742
- 256
1743
- 257
1744
- 258
1745
- 259
1746
- 260
1747
- 261
1748
- 262
1749
- 263
1750
- 264
1751
- 265
1752
- 266
1753
- 267
1754
- 268
1755
- 269
1756
- 270
1757
- 271
1758
- 272
1759
- 273
1760
- 274
1761
- 275
1762
1832
  276
1763
1833
  277
1764
1834
  278
1765
1835
  279
1766
1836
  280
1767
- 281</pre>
1837
+ 281
1838
+ 282
1839
+ 283
1840
+ 284
1841
+ 285
1842
+ 286
1843
+ 287
1844
+ 288
1845
+ 289
1846
+ 290
1847
+ 291
1848
+ 292
1849
+ 293
1850
+ 294
1851
+ 295
1852
+ 296
1853
+ 297
1854
+ 298
1855
+ 299
1856
+ 300
1857
+ 301
1858
+ 302
1859
+ 303
1860
+ 304
1861
+ 305
1862
+ 306
1863
+ 307
1864
+ 308
1865
+ 309</pre>
1768
1866
  </td>
1769
1867
  <td>
1770
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 249</span>
1868
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 276</span>
1771
1869
 
1772
1870
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_vertical_table'>parse_vertical_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
1773
1871
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
@@ -1776,7 +1874,8 @@ parsing.</p>
1776
1874
  <span class='label'>header_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1777
1875
  <span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1778
1876
  <span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1779
- <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1877
+ <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1878
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
1780
1879
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
1781
1880
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1782
1881
 
@@ -1810,7 +1909,7 @@ parsing.</p>
1810
1909
  <div class="method_details ">
1811
1910
  <h3 class="signature " id="strip-class_method">
1812
1911
 
1813
- .<strong>strip</strong>(raw_text) &#x21d2; <tt>String</tt><sup>?</sup>
1912
+ .<strong>strip</strong>(raw_text, orig_encoding = &#39;ASCII&#39;) &#x21d2; <tt>String</tt><sup>?</sup>
1814
1913
 
1815
1914
 
1816
1915
 
@@ -1819,7 +1918,10 @@ parsing.</p>
1819
1918
  </h3><div class="docstring">
1820
1919
  <div class="discussion">
1821
1920
 
1822
- <p>Strip a value.</p>
1921
+ <p>Strip a value by trimming spaces, reducing sequential spaces into a</p>
1922
+
1923
+ <pre class="code ruby"><code class="ruby">single space, decode HTML entities and change encoding to UTF-8.
1924
+ </code></pre>
1823
1925
 
1824
1926
 
1825
1927
  </div>
@@ -1840,6 +1942,24 @@ parsing.</p>
1840
1942
  &mdash;
1841
1943
  <div class='inline'>
1842
1944
  <p>Text to strip.</p>
1945
+ </div>
1946
+
1947
+ </li>
1948
+
1949
+ <li>
1950
+
1951
+ <span class='name'>orig_encoding</span>
1952
+
1953
+
1954
+ <span class='type'>(<tt>String</tt>)</span>
1955
+
1956
+
1957
+ <em class="default">(defaults to: <tt>&#39;ASCII&#39;</tt>)</em>
1958
+
1959
+
1960
+ &mdash;
1961
+ <div class='inline'>
1962
+ <p>Text original encoding.</p>
1843
1963
  </div>
1844
1964
 
1845
1965
  </li>
@@ -1871,8 +1991,6 @@ parsing.</p>
1871
1991
  <pre class="lines">
1872
1992
 
1873
1993
 
1874
- 42
1875
- 43
1876
1994
  44
1877
1995
  45
1878
1996
  46
@@ -1882,21 +2000,23 @@ parsing.</p>
1882
2000
  50
1883
2001
  51
1884
2002
  52
1885
- 53</pre>
2003
+ 53
2004
+ 54
2005
+ 55</pre>
1886
2006
  </td>
1887
2007
  <td>
1888
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 42</span>
2008
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 44</span>
1889
2009
 
1890
- <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span>
2010
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='comma'>,</span> <span class='id identifier rubyid_orig_encoding'>orig_encoding</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>ASCII</span><span class='tstring_end'>&#39;</span></span>
1891
2011
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1892
2012
  <span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span> <span class='kw'>unless</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span> <span class='const'>String</span>
1893
2013
  <span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0)+</span><span class='regexp_end'>/</span></span>
1894
2014
  <span class='id identifier rubyid_good_encoding'>good_encoding</span> <span class='op'>=</span> <span class='lparen'>(</span><span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=~</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\u3000</span><span class='regexp_end'>/</span></span> <span class='op'>||</span> <span class='kw'>true</span><span class='rparen'>)</span> <span class='kw'>rescue</span> <span class='kw'>false</span>
1895
2015
  <span class='kw'>unless</span> <span class='id identifier rubyid_good_encoding'>good_encoding</span>
1896
- <span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='gvar'>$APP_CONFIG</span><span class='lbracket'>[</span><span class='symbol'>:encoding</span><span class='rbracket'>]</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_encode'>encode</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>UTF-8</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
2016
+ <span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='id identifier rubyid_orig_encoding'>orig_encoding</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_encode'>encode</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>UTF-8</span><span class='tstring_end'>&#39;</span></span><span class='comma'>,</span> <span class='label'>invalid:</span> <span class='symbol'>:replace</span><span class='comma'>,</span> <span class='label'>undef:</span> <span class='symbol'>:replace</span><span class='rparen'>)</span>
1897
2017
  <span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0|\u00c2\u00a0)+</span><span class='regexp_end'>/</span></span>
1898
2018
  <span class='kw'>end</span>
1899
- <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='op'>&amp;.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='id identifier rubyid_regex'>regex</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'> </span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='op'>&amp;.</span><span class='id identifier rubyid_strip'>strip</span>
2019
+ <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='id identifier rubyid_regex'>regex</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'> </span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span>
1900
2020
  <span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_decode_html'>decode_html</span><span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span><span class='rparen'>)</span>
1901
2021
  <span class='kw'>end</span></pre>
1902
2022
  </td>
@@ -1984,25 +2104,27 @@ parsing.</p>
1984
2104
  <pre class="lines">
1985
2105
 
1986
2106
 
1987
- 131
1988
- 132
1989
- 133
1990
- 134
1991
- 135
1992
- 136
1993
- 137
1994
- 138</pre>
2107
+ 142
2108
+ 143
2109
+ 144
2110
+ 145
2111
+ 146
2112
+ 147
2113
+ 148
2114
+ 149
2115
+ 150</pre>
1995
2116
  </td>
1996
2117
  <td>
1997
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 131</span>
2118
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 142</span>
1998
2119
 
1999
2120
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_element'>element</span><span class='comma'>,</span> <span class='id identifier rubyid_label_map'>label_map</span>
2000
- <span class='id identifier rubyid_element'>element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span>
2001
- <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_element'>element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_text'>text</span>
2002
- <span class='id identifier rubyid_key'>key</span> <span class='op'>=</span> <span class='id identifier rubyid_label_map'>label_map</span><span class='period'>.</span><span class='id identifier rubyid_find'>find</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_k'>k</span><span class='comma'>,</span><span class='id identifier rubyid_v'>v</span><span class='op'>|</span>
2121
+ <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
2122
+ <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>&gt;</span> <span class='int'>0</span>
2123
+ <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
2124
+ <span class='id identifier rubyid_key_pair'>key_pair</span> <span class='op'>=</span> <span class='id identifier rubyid_label_map'>label_map</span><span class='period'>.</span><span class='id identifier rubyid_find'>find</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_k'>k</span><span class='comma'>,</span><span class='id identifier rubyid_v'>v</span><span class='op'>|</span>
2003
2125
  <span class='id identifier rubyid_v'>v</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span><span class='lparen'>(</span><span class='const'>Regexp</span><span class='rparen'>)</span> <span class='op'>?</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>=~</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span> <span class='op'>:</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>==</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span>
2004
- <span class='kw'>end</span><span class='op'>&amp;.</span><span class='id identifier rubyid_first'>first</span>
2005
- <span class='id identifier rubyid_key'>key</span>
2126
+ <span class='kw'>end</span>
2127
+ <span class='id identifier rubyid_key'>key</span> <span class='op'>=</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
2006
2128
  <span class='kw'>end</span></pre>
2007
2129
  </td>
2008
2130
  </tr>
@@ -2014,7 +2136,7 @@ parsing.</p>
2014
2136
  </div>
2015
2137
 
2016
2138
  <div id="footer">
2017
- Generated on Tue Feb 26 16:50:03 2019 by
2139
+ Generated on Fri Mar 8 17:26:54 2019 by
2018
2140
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
2019
2141
  0.9.18 (ruby-2.5.3).
2020
2142
  </div>
data/doc/_index.html CHANGED
@@ -112,7 +112,7 @@
112
112
  </div>
113
113
 
114
114
  <div id="footer">
115
- Generated on Tue Feb 26 16:50:02 2019 by
115
+ Generated on Fri Mar 8 17:26:54 2019 by
116
116
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
117
117
  0.9.18 (ruby-2.5.3).
118
118
  </div>
data/doc/file.README.html CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
81
81
  </div></div>
82
82
 
83
83
  <div id="footer">
84
- Generated on Tue Feb 26 16:50:02 2019 by
84
+ Generated on Fri Mar 8 17:26:54 2019 by
85
85
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
86
86
  0.9.18 (ruby-2.5.3).
87
87
  </div>
data/doc/index.html CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
81
81
  </div></div>
82
82
 
83
83
  <div id="footer">
84
- Generated on Tue Feb 26 16:50:02 2019 by
84
+ Generated on Fri Mar 8 17:26:54 2019 by
85
85
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
86
86
  0.9.18 (ruby-2.5.3).
87
87
  </div>
@@ -100,7 +100,7 @@
100
100
  </div>
101
101
 
102
102
  <div id="footer">
103
- Generated on Tue Feb 26 16:50:02 2019 by
103
+ Generated on Fri Mar 8 17:26:54 2019 by
104
104
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
105
  0.9.18 (ruby-2.5.3).
106
106
  </div>
data/lib/ae_easy/text.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'cgi'
2
2
  require 'json'
3
3
  require 'digest/sha1'
4
- require 'ae_easy-core'
4
+ require 'ae_easy/core'
5
5
  require 'ae_easy/text/version'
6
6
 
7
7
  module AeEasy
@@ -34,21 +34,23 @@ module AeEasy
34
34
  CGI.unescapeHTML text
35
35
  end
36
36
 
37
- # Strip a value.
37
+ # Strip a value by trimming spaces, reducing sequential spaces into a
38
+ # single space, decoding HTML entities and changing encoding to UTF-8.
38
39
  #
39
40
  # @param [String,Object,nil] raw_text Text to strip.
41
+ # @param [String] orig_encoding Text original encoding.
40
42
  #
41
43
  # @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
42
- def self.strip raw_text
44
+ def self.strip raw_text, orig_encoding = 'ASCII'
43
45
  return nil if raw_text.nil?
44
46
  raw_text = raw_text.to_s unless raw_text.is_a? String
45
47
  regex = /(\s|\u3000|\u00a0)+/
46
48
  good_encoding = (raw_text =~ /\u3000/ || true) rescue false
47
49
  unless good_encoding
48
- raw_text = raw_text.force_encoding($APP_CONFIG[:encoding]).encode('UTF-8')
50
+ raw_text = raw_text.force_encoding(orig_encoding).encode('UTF-8', invalid: :replace, undef: :replace)
49
51
  regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
50
52
  end
51
- text = raw_text&.gsub(regex, ' ')&.strip
53
+ text = raw_text.gsub(regex, ' ').strip
52
54
  text.nil? ? nil : decode_html(text)
53
55
  end
54
56
 
@@ -58,8 +60,9 @@ module AeEasy
58
60
  # @param [Hash] data Data hash to save parsed data into.
59
61
  # @param [String,Symbol] key Header column key being parsed.
60
62
  def self.default_parser cell_element, data, key
61
- cell_element&.search('//i').remove
62
- row_data[key] = strip cell_element&.text
63
+ return if cell_element.nil?
64
+ cell_element.search('//i').remove if cell_element.search('//i').count > 0
65
+ data[key] = strip cell_element.text
63
66
  end
64
67
 
65
68
  # Parse row data matching a selector using a header map to translate
@@ -74,6 +77,8 @@ module AeEasy
74
77
  # index dictionary.
75
78
  # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
76
79
  # Custom column parsers for advance data extraction.
80
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
81
+ # retrieving content cells and rows.
77
82
  #
78
83
  # @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
79
84
  # @yieldparam [Array] row Raw row data.
@@ -87,7 +92,8 @@ module AeEasy
87
92
  selector: nil,
88
93
  first_row_header: false,
89
94
  header_map: {},
90
- column_parsers: {}
95
+ column_parsers: {},
96
+ ignore_text_nodes: true
91
97
  }.merge opts
92
98
 
93
99
  # Setup config
@@ -96,10 +102,13 @@ module AeEasy
96
102
  first = first_row_header = opts[:first_row_header]
97
103
  header_map = opts[:header_map]
98
104
  column_parsers = opts[:column_parsers]
105
+ ignore_text_nodes = opts[:ignore_text_nodes]
99
106
 
100
107
  # Get and parse rows
101
108
  html_rows = opts[:html].css(opts[:selector])
102
109
  html_rows.each do |row|
110
+ next if ignore_text_nodes && row.name == 'text'
111
+
103
112
  # First row header validation
104
113
  if first && first_row_header
105
114
  first = false
@@ -110,7 +119,9 @@ module AeEasy
110
119
  row_data = {}
111
120
  header_map.each do |key, index|
112
121
  # Parse column html with default or custom parser
113
- child_element = row.children[index]
122
+ children = row.children
123
+ children = children.select{|i|i.name != 'text'} if ignore_text_nodes
124
+ child_element = children[index]
114
125
  column_parsers[key].nil? ?
115
126
  default_parser(child_element, row_data, key) :
116
127
  column_parsers[key].call(child_element, row_data, key)
@@ -129,12 +140,13 @@ module AeEasy
129
140
  #
130
141
  # @return [Symbol,String] Translated key.
131
142
  def self.translate_label_to_key element, label_map
132
- element&.search('//i').remove
133
- text = strip element&.text
134
- key = label_map.find do |k,v|
143
+ return nil if element.nil?
144
+ element.search('//i').remove if element.search('//i').count > 0
145
+ text = strip element.text
146
+ key_pair = label_map.find do |k,v|
135
147
  v.is_a?(Regexp) ? (text =~ v) : (text == v)
136
- end&.first
137
- key
148
+ end
149
+ key = key_pair.nil? ? nil : key_pair[0]
138
150
  end
139
151
 
140
152
  # Parse header from selector and create a header map to match a column key
@@ -147,6 +159,8 @@ module AeEasy
147
159
  # Key vs. label dictionary.
148
160
  # @option opts [Boolean] :first_row_header (false) If true then selector
149
161
  # first matching row will be used as header for parsing.
162
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
163
+ # retrieving header cells and rows.
150
164
  #
151
165
  # @return [Hash{Symbol,String => Integer},nil] Key vs. column index map.
152
166
  def self.parse_header_map opts = {}
@@ -154,11 +168,13 @@ module AeEasy
154
168
  html: nil,
155
169
  selector: nil,
156
170
  column_key_label_map: {},
157
- first_row_header: false
171
+ first_row_header: false,
172
+ ignore_text_nodes: true
158
173
  }.merge opts
159
174
 
160
175
  # Setup config
161
176
  dictionary = opts[:column_key_label_map]
177
+ ignore_text_nodes = opts[:ignore_text_nodes]
162
178
  data = []
163
179
  column_map = nil
164
180
 
@@ -167,8 +183,12 @@ module AeEasy
167
183
  return nil if html_rows.nil?
168
184
  html_rows = [html_rows.first] if opts[:first_row_header]
169
185
  html_rows.each do |row|
186
+ next if ignore_text_nodes && row.name == 'text'
187
+
170
188
  column_map = {}
171
- row.children.each_with_index do |col, index|
189
+ children = row.children
190
+ children = children.select{|i|i.name != 'text'} if ignore_text_nodes
191
+ children.each_with_index do |col, index|
172
192
  # Parse and map column header
173
193
  column_key = translate_label_to_key col, dictionary
174
194
  next if column_key.nil?
@@ -192,6 +212,8 @@ module AeEasy
192
212
  # first matching row will be used as header for parsing.
193
213
  # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
194
214
  # Custom column parsers for advance data extraction.
215
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
216
+ # retrieving cells and rows.
195
217
  #
196
218
  # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
197
219
  # @yieldparam [Array] row Raw content row data.
@@ -208,19 +230,22 @@ module AeEasy
208
230
  header_key_label_map: {},
209
231
  content_selector: nil,
210
232
  first_row_header: false,
211
- column_parsers: {}
233
+ column_parsers: {},
234
+ ignore_text_nodes: true
212
235
  }.merge opts
213
236
  return nil if opts[:html].nil?
214
237
  header_map = self.parse_header_map html: opts[:html],
215
238
  selector: opts[:header_selector],
216
239
  column_key_label_map: opts[:header_key_label_map],
217
- first_row_header: opts[:first_row_header]
240
+ first_row_header: opts[:first_row_header],
241
+ ignore_text_nodes: opts[:ignore_text_nodes]
218
242
  return nil if header_map.nil?
219
243
  data = self.parse_content html: opts[:html],
220
244
  selector: opts[:content_selector],
221
245
  header_map: header_map,
222
246
  first_row_header: opts[:first_row_header],
223
247
  column_parsers: opts[:column_parsers],
248
+ ignore_text_nodes: opts[:ignore_text_nodes],
224
249
  &filter
225
250
  {header_map: header_map, data: data}
226
251
  end
@@ -237,6 +262,8 @@ module AeEasy
237
262
  # @option opts [String] :content_selector Content row elements selector.
238
263
  # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
239
264
  # Custom column parsers for advance data extraction.
265
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
266
+ # retrieving cells and rows.
240
267
  #
241
268
  # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
242
269
  # @yieldparam [Array] row Raw content row data.
@@ -253,7 +280,8 @@ module AeEasy
253
280
  header_selector: nil,
254
281
  header_key_label_map: {},
255
282
  content_selector: nil,
256
- column_parsers: {}
283
+ column_parsers: {},
284
+ ignore_text_nodes: true
257
285
  }.merge opts
258
286
  return nil if opts[:html].nil?
259
287
 
@@ -1,6 +1,6 @@
1
1
  module AeEasy
2
2
  module Text
3
3
  # Gem version
4
- VERSION = "0.0.1"
4
+ VERSION = "0.0.2"
5
5
  end
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ae_easy-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo Rosales
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-26 00:00:00.000000000 Z
11
+ date: 2019-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ae_easy-core
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: 0.1.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: 0.1.2
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement