ae_easy-text 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 239189344e783f67b085da7394e535aa693a4b067c62b8d0b16f733a0b19d4f7
4
- data.tar.gz: ca144105f26e399116b05560ff870f6aa051a04696602f6f68f67f06b9e0bfda
3
+ metadata.gz: 10296214e4de01abc2d77f5a5549c9e4c883009a86915908283bb71dad3bec0b
4
+ data.tar.gz: 9db77d0892191a3dd5c170ffe2257c14559f0721075c88c1fc75ce6fa0f0b04e
5
5
  SHA512:
6
- metadata.gz: 0b7c4495eeb71e5dae3ad799d14f8a2d83989a949183ee3df2837191b4a4f3a10965ead38416ccda078da7b29fc083eb02fd53a24999f97473b02f77489d921c
7
- data.tar.gz: 4f377b26bcfb0ef4cce7806d153fb97de115d0e0bc4beef5e43b55b0125e1936d6c4440d1131cbe9cb4d5fe28a1810c2820269f6c317511068c91aff42ad8126
6
+ metadata.gz: 6582399f34051ebcc5fa5192c22fc984329a04b1ffefb8c2db49ac5e782978b49c3fc4ea5c7fe6c0f0bbdffb4fff1799adb03f79ca92358b13409691e10b6b0d
7
+ data.tar.gz: 174c5533dd32772393fb7a8ba632fb4de8fe5ce8eba6e35d753abc906b295c1659d6cde0cf18e7496b6469a430e7f922335e23f7c3d5d724eba019163a6801f3
data/ae_easy-text.gemspec CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
38
38
  spec.require_paths = ["lib"]
39
39
  spec.required_ruby_version = '>= 2.2.2'
40
40
 
41
- spec.add_dependency 'ae_easy-core', '>= 0'
41
+ spec.add_dependency 'ae_easy-core', '>= 0.1.2'
42
42
  spec.add_development_dependency 'bundler', '>= 1.16.3'
43
43
  spec.add_development_dependency 'rake', '>= 10.0'
44
44
  spec.add_development_dependency 'minitest', '>= 5.11'
data/doc/AeEasy.html CHANGED
@@ -107,7 +107,7 @@
107
107
  </div>
108
108
 
109
109
  <div id="footer">
110
- Generated on Tue Feb 26 16:50:02 2019 by
110
+ Generated on Fri Mar 8 17:26:54 2019 by
111
111
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
112
112
  0.9.18 (ruby-2.5.3).
113
113
  </div>
data/doc/AeEasy/Text.html CHANGED
@@ -108,7 +108,7 @@
108
108
 
109
109
  </div>
110
110
  </dt>
111
- <dd><pre class="code"><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>0.0.1</span><span class='tstring_end'>&quot;</span></span></pre></dd>
111
+ <dd><pre class="code"><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>0.0.2</span><span class='tstring_end'>&quot;</span></span></pre></dd>
112
112
 
113
113
  </dl>
114
114
 
@@ -326,7 +326,7 @@ using a header map to match columns.</p>
326
326
  <li class="public ">
327
327
  <span class="summary_signature">
328
328
 
329
- <a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text) &#x21d2; String<sup>?</sup> </a>
329
+ <a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text, orig_encoding = &#39;ASCII&#39;) &#x21d2; String<sup>?</sup> </a>
330
330
 
331
331
 
332
332
 
@@ -341,7 +341,8 @@ using a header map to match columns.</p>
341
341
 
342
342
 
343
343
  <span class="summary_desc"><div class='inline'>
344
- <p>Strip a value.</p>
344
+ <p>Strip a value by trimming spaces, reducing sequential spaces into a
345
+ single space, decode HTML entities and change encoding to UTF-8.</p>
345
346
  </div></span>
346
347
 
347
348
  </li>
@@ -532,17 +533,19 @@ using a header map to match columns.</p>
532
533
  <pre class="lines">
533
534
 
534
535
 
535
- 60
536
- 61
537
536
  62
538
- 63</pre>
537
+ 63
538
+ 64
539
+ 65
540
+ 66</pre>
539
541
  </td>
540
542
  <td>
541
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 60</span>
543
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 62</span>
542
544
 
543
545
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_default_parser'>default_parser</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='comma'>,</span> <span class='id identifier rubyid_data'>data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span>
544
- <span class='id identifier rubyid_cell_element'>cell_element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span>
545
- <span class='id identifier rubyid_row_data'>row_data</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_text'>text</span>
546
+ <span class='kw'>return</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
547
+ <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>&gt;</span> <span class='int'>0</span>
548
+ <span class='id identifier rubyid_data'>data</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
546
549
  <span class='kw'>end</span></pre>
547
550
  </td>
548
551
  </tr>
@@ -815,6 +818,22 @@ ignored.</p>
815
818
 
816
819
  &mdash; <div class='inline'>
817
820
  <p>Custom column parsers for advanced data extraction.</p>
821
+ </div>
822
+
823
+ </li>
824
+
825
+ <li>
826
+ <span class="name">:ignore_text_nodes</span>
827
+ <span class="type">(<tt>Boolean</tt>)</span>
828
+ <span class="default">
829
+
830
+ &mdash; default:
831
+ <tt>true</tt>
832
+
833
+ </span>
834
+
835
+ &mdash; <div class='inline'>
836
+ <p>Ignore text nodes when retrieving content cells and rows.</p>
818
837
  </div>
819
838
 
820
839
  </li>
@@ -917,11 +936,6 @@ ignored.</p>
917
936
  <pre class="lines">
918
937
 
919
938
 
920
- 84
921
- 85
922
- 86
923
- 87
924
- 88
925
939
  89
926
940
  90
927
941
  91
@@ -955,10 +969,21 @@ ignored.</p>
955
969
  119
956
970
  120
957
971
  121
958
- 122</pre>
972
+ 122
973
+ 123
974
+ 124
975
+ 125
976
+ 126
977
+ 127
978
+ 128
979
+ 129
980
+ 130
981
+ 131
982
+ 132
983
+ 133</pre>
959
984
  </td>
960
985
  <td>
961
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 84</span>
986
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 89</span>
962
987
 
963
988
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='id identifier rubyid_opts'>opts</span><span class='comma'>,</span> <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
964
989
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
@@ -966,7 +991,8 @@ ignored.</p>
966
991
  <span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
967
992
  <span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
968
993
  <span class='label'>header_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
969
- <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
994
+ <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
995
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
970
996
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
971
997
 
972
998
  <span class='comment'># Setup config
@@ -975,10 +1001,13 @@ ignored.</p>
975
1001
  <span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
976
1002
  <span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_map</span><span class='rbracket'>]</span>
977
1003
  <span class='id identifier rubyid_column_parsers'>column_parsers</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span>
1004
+ <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
978
1005
 
979
1006
  <span class='comment'># Get and parse rows
980
1007
  </span> <span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_css'>css</span><span class='lparen'>(</span><span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:selector</span><span class='rbracket'>]</span><span class='rparen'>)</span>
981
1008
  <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
1009
+ <span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span>
1010
+
982
1011
  <span class='comment'># First row header validation
983
1012
  </span> <span class='kw'>if</span> <span class='id identifier rubyid_first'>first</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span>
984
1013
  <span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='kw'>false</span>
@@ -989,7 +1018,9 @@ ignored.</p>
989
1018
  </span> <span class='id identifier rubyid_row_data'>row_data</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
990
1019
  <span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_key'>key</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
991
1020
  <span class='comment'># Parse column html with default or custom parser
992
- </span> <span class='id identifier rubyid_child_element'>child_element</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span><span class='lbracket'>[</span><span class='id identifier rubyid_index'>index</span><span class='rbracket'>]</span>
1021
+ </span> <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
1022
+ <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
1023
+ <span class='id identifier rubyid_child_element'>child_element</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='lbracket'>[</span><span class='id identifier rubyid_index'>index</span><span class='rbracket'>]</span>
993
1024
  <span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span>
994
1025
  <span class='id identifier rubyid_default_parser'>default_parser</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span> <span class='op'>:</span>
995
1026
  <span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_call'>call</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span>
@@ -1106,6 +1137,22 @@ ignored.</p>
1106
1137
  &mdash; <div class='inline'>
1107
1138
  <p>If true then selector first matching row will be used as header for
1108
1139
  parsing.</p>
1140
+ </div>
1141
+
1142
+ </li>
1143
+
1144
+ <li>
1145
+ <span class="name">:ignore_text_nodes</span>
1146
+ <span class="type">(<tt>Boolean</tt>)</span>
1147
+ <span class="default">
1148
+
1149
+ &mdash; default:
1150
+ <tt>true</tt>
1151
+
1152
+ </span>
1153
+
1154
+ &mdash; <div class='inline'>
1155
+ <p>Ignore text nodes when retrieving header cells and rows.</p>
1109
1156
  </div>
1110
1157
 
1111
1158
  </li>
@@ -1138,20 +1185,6 @@ parsing.</p>
1138
1185
  <pre class="lines">
1139
1186
 
1140
1187
 
1141
- 152
1142
- 153
1143
- 154
1144
- 155
1145
- 156
1146
- 157
1147
- 158
1148
- 159
1149
- 160
1150
- 161
1151
- 162
1152
- 163
1153
- 164
1154
- 165
1155
1188
  166
1156
1189
  167
1157
1190
  168
@@ -1166,21 +1199,43 @@ parsing.</p>
1166
1199
  177
1167
1200
  178
1168
1201
  179
1169
- 180</pre>
1202
+ 180
1203
+ 181
1204
+ 182
1205
+ 183
1206
+ 184
1207
+ 185
1208
+ 186
1209
+ 187
1210
+ 188
1211
+ 189
1212
+ 190
1213
+ 191
1214
+ 192
1215
+ 193
1216
+ 194
1217
+ 195
1218
+ 196
1219
+ 197
1220
+ 198
1221
+ 199
1222
+ 200</pre>
1170
1223
  </td>
1171
1224
  <td>
1172
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 152</span>
1225
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 166</span>
1173
1226
 
1174
1227
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1175
1228
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
1176
1229
  <span class='label'>html:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1177
1230
  <span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1178
1231
  <span class='label'>column_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1179
- <span class='label'>first_row_header:</span> <span class='kw'>false</span>
1232
+ <span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
1233
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
1180
1234
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
1181
1235
 
1182
1236
  <span class='comment'># Setup config
1183
1237
  </span> <span class='id identifier rubyid_dictionary'>dictionary</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_key_label_map</span><span class='rbracket'>]</span>
1238
+ <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
1184
1239
  <span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
1185
1240
  <span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='kw'>nil</span>
1186
1241
 
@@ -1189,8 +1244,12 @@ parsing.</p>
1189
1244
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1190
1245
  <span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_first'>first</span><span class='rbracket'>]</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
1191
1246
  <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
1247
+ <span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span>
1248
+
1192
1249
  <span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1193
- <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_each_with_index'>each_with_index</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
1250
+ <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
1251
+ <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>text</span><span class='tstring_end'>&#39;</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
1252
+ <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_each_with_index'>each_with_index</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
1194
1253
  <span class='comment'># Parse and map column header
1195
1254
  </span> <span class='id identifier rubyid_column_key'>column_key</span> <span class='op'>=</span> <span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_dictionary'>dictionary</span>
1196
1255
  <span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_column_key'>column_key</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
@@ -1336,6 +1395,22 @@ parsing.</p>
1336
1395
 
1337
1396
  &mdash; <div class='inline'>
1338
1397
  <p>Custom column parsers for advanced data extraction.</p>
1398
+ </div>
1399
+
1400
+ </li>
1401
+
1402
+ <li>
1403
+ <span class="name">:ignore_text_nodes</span>
1404
+ <span class="type">(<tt>Boolean</tt>)</span>
1405
+ <span class="default">
1406
+
1407
+ &mdash; default:
1408
+ <tt>true</tt>
1409
+
1410
+ </span>
1411
+
1412
+ &mdash; <div class='inline'>
1413
+ <p>Ignore text nodes when retrieving cells and rows.</p>
1339
1414
  </div>
1340
1415
 
1341
1416
  </li>
@@ -1443,32 +1518,35 @@ parsing.</p>
1443
1518
  <pre class="lines">
1444
1519
 
1445
1520
 
1446
- 204
1447
- 205
1448
- 206
1449
- 207
1450
- 208
1451
- 209
1452
- 210
1453
- 211
1454
- 212
1455
- 213
1456
- 214
1457
- 215
1458
- 216
1459
- 217
1460
- 218
1461
- 219
1462
- 220
1463
- 221
1464
- 222
1465
- 223
1466
- 224
1467
- 225
1468
- 226</pre>
1521
+ 226
1522
+ 227
1523
+ 228
1524
+ 229
1525
+ 230
1526
+ 231
1527
+ 232
1528
+ 233
1529
+ 234
1530
+ 235
1531
+ 236
1532
+ 237
1533
+ 238
1534
+ 239
1535
+ 240
1536
+ 241
1537
+ 242
1538
+ 243
1539
+ 244
1540
+ 245
1541
+ 246
1542
+ 247
1543
+ 248
1544
+ 249
1545
+ 250
1546
+ 251</pre>
1469
1547
  </td>
1470
1548
  <td>
1471
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 204</span>
1549
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 226</span>
1472
1550
 
1473
1551
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_table'>parse_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
1474
1552
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
@@ -1477,19 +1555,22 @@ parsing.</p>
1477
1555
  <span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1478
1556
  <span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1479
1557
  <span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
1480
- <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1558
+ <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1559
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
1481
1560
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
1482
1561
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1483
1562
  <span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
1484
1563
  <span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
1485
1564
  <span class='label'>column_key_label_map:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_key_label_map</span><span class='rbracket'>]</span><span class='comma'>,</span>
1486
- <span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
1565
+ <span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
1566
+ <span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
1487
1567
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1488
1568
  <span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
1489
1569
  <span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:content_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
1490
1570
  <span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span>
1491
1571
  <span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
1492
1572
  <span class='label'>column_parsers:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span><span class='comma'>,</span>
1573
+ <span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span><span class='comma'>,</span>
1493
1574
  <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
1494
1575
  <span class='lbrace'>{</span><span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span> <span class='label'>data:</span> <span class='id identifier rubyid_data'>data</span><span class='rbrace'>}</span>
1495
1576
  <span class='kw'>end</span></pre>
@@ -1625,6 +1706,22 @@ parsing.</p>
1625
1706
 
1626
1707
  &mdash; <div class='inline'>
1627
1708
  <p>Custom column parsers for advanced data extraction.</p>
1709
+ </div>
1710
+
1711
+ </li>
1712
+
1713
+ <li>
1714
+ <span class="name">:ignore_text_nodes</span>
1715
+ <span class="type">(<tt>Boolean</tt>)</span>
1716
+ <span class="default">
1717
+
1718
+ &mdash; default:
1719
+ <tt>true</tt>
1720
+
1721
+ </span>
1722
+
1723
+ &mdash; <div class='inline'>
1724
+ <p>Ignore text nodes when retrieving cells and rows.</p>
1628
1725
  </div>
1629
1726
 
1630
1727
  </li>
@@ -1732,42 +1829,43 @@ parsing.</p>
1732
1829
  <pre class="lines">
1733
1830
 
1734
1831
 
1735
- 249
1736
- 250
1737
- 251
1738
- 252
1739
- 253
1740
- 254
1741
- 255
1742
- 256
1743
- 257
1744
- 258
1745
- 259
1746
- 260
1747
- 261
1748
- 262
1749
- 263
1750
- 264
1751
- 265
1752
- 266
1753
- 267
1754
- 268
1755
- 269
1756
- 270
1757
- 271
1758
- 272
1759
- 273
1760
- 274
1761
- 275
1762
1832
  276
1763
1833
  277
1764
1834
  278
1765
1835
  279
1766
1836
  280
1767
- 281</pre>
1837
+ 281
1838
+ 282
1839
+ 283
1840
+ 284
1841
+ 285
1842
+ 286
1843
+ 287
1844
+ 288
1845
+ 289
1846
+ 290
1847
+ 291
1848
+ 292
1849
+ 293
1850
+ 294
1851
+ 295
1852
+ 296
1853
+ 297
1854
+ 298
1855
+ 299
1856
+ 300
1857
+ 301
1858
+ 302
1859
+ 303
1860
+ 304
1861
+ 305
1862
+ 306
1863
+ 307
1864
+ 308
1865
+ 309</pre>
1768
1866
  </td>
1769
1867
  <td>
1770
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 249</span>
1868
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 276</span>
1771
1869
 
1772
1870
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_vertical_table'>parse_vertical_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&amp;</span><span class='id identifier rubyid_filter'>filter</span>
1773
1871
  <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
@@ -1776,7 +1874,8 @@ parsing.</p>
1776
1874
  <span class='label'>header_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1777
1875
  <span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1778
1876
  <span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
1779
- <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
1877
+ <span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
1878
+ <span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
1780
1879
  <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
1781
1880
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1782
1881
 
@@ -1810,7 +1909,7 @@ parsing.</p>
1810
1909
  <div class="method_details ">
1811
1910
  <h3 class="signature " id="strip-class_method">
1812
1911
 
1813
- .<strong>strip</strong>(raw_text) &#x21d2; <tt>String</tt><sup>?</sup>
1912
+ .<strong>strip</strong>(raw_text, orig_encoding = &#39;ASCII&#39;) &#x21d2; <tt>String</tt><sup>?</sup>
1814
1913
 
1815
1914
 
1816
1915
 
@@ -1819,7 +1918,10 @@ parsing.</p>
1819
1918
  </h3><div class="docstring">
1820
1919
  <div class="discussion">
1821
1920
 
1822
- <p>Strip a value.</p>
1921
+ <p>Strip a value by trimming spaces, reducing sequential spaces into a</p>
1922
+
1923
+ <pre class="code ruby"><code class="ruby">single space, decode HTML entities and change encoding to UTF-8.
1924
+ </code></pre>
1823
1925
 
1824
1926
 
1825
1927
  </div>
@@ -1840,6 +1942,24 @@ parsing.</p>
1840
1942
  &mdash;
1841
1943
  <div class='inline'>
1842
1944
  <p>Text to strip.</p>
1945
+ </div>
1946
+
1947
+ </li>
1948
+
1949
+ <li>
1950
+
1951
+ <span class='name'>orig_encoding</span>
1952
+
1953
+
1954
+ <span class='type'>(<tt>String</tt>)</span>
1955
+
1956
+
1957
+ <em class="default">(defaults to: <tt>&#39;ASCII&#39;</tt>)</em>
1958
+
1959
+
1960
+ &mdash;
1961
+ <div class='inline'>
1962
+ <p>Text original encoding.</p>
1843
1963
  </div>
1844
1964
 
1845
1965
  </li>
@@ -1871,8 +1991,6 @@ parsing.</p>
1871
1991
  <pre class="lines">
1872
1992
 
1873
1993
 
1874
- 42
1875
- 43
1876
1994
  44
1877
1995
  45
1878
1996
  46
@@ -1882,21 +2000,23 @@ parsing.</p>
1882
2000
  50
1883
2001
  51
1884
2002
  52
1885
- 53</pre>
2003
+ 53
2004
+ 54
2005
+ 55</pre>
1886
2006
  </td>
1887
2007
  <td>
1888
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 42</span>
2008
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 44</span>
1889
2009
 
1890
- <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span>
2010
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='comma'>,</span> <span class='id identifier rubyid_orig_encoding'>orig_encoding</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>ASCII</span><span class='tstring_end'>&#39;</span></span>
1891
2011
  <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
1892
2012
  <span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span> <span class='kw'>unless</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span> <span class='const'>String</span>
1893
2013
  <span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0)+</span><span class='regexp_end'>/</span></span>
1894
2014
  <span class='id identifier rubyid_good_encoding'>good_encoding</span> <span class='op'>=</span> <span class='lparen'>(</span><span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=~</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\u3000</span><span class='regexp_end'>/</span></span> <span class='op'>||</span> <span class='kw'>true</span><span class='rparen'>)</span> <span class='kw'>rescue</span> <span class='kw'>false</span>
1895
2015
  <span class='kw'>unless</span> <span class='id identifier rubyid_good_encoding'>good_encoding</span>
1896
- <span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='gvar'>$APP_CONFIG</span><span class='lbracket'>[</span><span class='symbol'>:encoding</span><span class='rbracket'>]</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_encode'>encode</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>UTF-8</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
2016
+ <span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='id identifier rubyid_orig_encoding'>orig_encoding</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_encode'>encode</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>UTF-8</span><span class='tstring_end'>&#39;</span></span><span class='comma'>,</span> <span class='label'>invalid:</span> <span class='symbol'>:replace</span><span class='comma'>,</span> <span class='label'>undef:</span> <span class='symbol'>:replace</span><span class='rparen'>)</span>
1897
2017
  <span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0|\u00c2\u00a0)+</span><span class='regexp_end'>/</span></span>
1898
2018
  <span class='kw'>end</span>
1899
- <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='op'>&amp;.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='id identifier rubyid_regex'>regex</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'> </span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='op'>&amp;.</span><span class='id identifier rubyid_strip'>strip</span>
2019
+ <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='id identifier rubyid_regex'>regex</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'> </span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span>
1900
2020
  <span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_decode_html'>decode_html</span><span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span><span class='rparen'>)</span>
1901
2021
  <span class='kw'>end</span></pre>
1902
2022
  </td>
@@ -1984,25 +2104,27 @@ parsing.</p>
1984
2104
  <pre class="lines">
1985
2105
 
1986
2106
 
1987
- 131
1988
- 132
1989
- 133
1990
- 134
1991
- 135
1992
- 136
1993
- 137
1994
- 138</pre>
2107
+ 142
2108
+ 143
2109
+ 144
2110
+ 145
2111
+ 146
2112
+ 147
2113
+ 148
2114
+ 149
2115
+ 150</pre>
1995
2116
  </td>
1996
2117
  <td>
1997
- <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 131</span>
2118
+ <pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 142</span>
1998
2119
 
1999
2120
  <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_element'>element</span><span class='comma'>,</span> <span class='id identifier rubyid_label_map'>label_map</span>
2000
- <span class='id identifier rubyid_element'>element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span>
2001
- <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_element'>element</span><span class='op'>&amp;.</span><span class='id identifier rubyid_text'>text</span>
2002
- <span class='id identifier rubyid_key'>key</span> <span class='op'>=</span> <span class='id identifier rubyid_label_map'>label_map</span><span class='period'>.</span><span class='id identifier rubyid_find'>find</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_k'>k</span><span class='comma'>,</span><span class='id identifier rubyid_v'>v</span><span class='op'>|</span>
2121
+ <span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
2122
+ <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//i</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>&gt;</span> <span class='int'>0</span>
2123
+ <span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
2124
+ <span class='id identifier rubyid_key_pair'>key_pair</span> <span class='op'>=</span> <span class='id identifier rubyid_label_map'>label_map</span><span class='period'>.</span><span class='id identifier rubyid_find'>find</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_k'>k</span><span class='comma'>,</span><span class='id identifier rubyid_v'>v</span><span class='op'>|</span>
2003
2125
  <span class='id identifier rubyid_v'>v</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span><span class='lparen'>(</span><span class='const'>Regexp</span><span class='rparen'>)</span> <span class='op'>?</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>=~</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span> <span class='op'>:</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>==</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span>
2004
- <span class='kw'>end</span><span class='op'>&amp;.</span><span class='id identifier rubyid_first'>first</span>
2005
- <span class='id identifier rubyid_key'>key</span>
2126
+ <span class='kw'>end</span>
2127
+ <span class='id identifier rubyid_key'>key</span> <span class='op'>=</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
2006
2128
  <span class='kw'>end</span></pre>
2007
2129
  </td>
2008
2130
  </tr>
@@ -2014,7 +2136,7 @@ parsing.</p>
2014
2136
  </div>
2015
2137
 
2016
2138
  <div id="footer">
2017
- Generated on Tue Feb 26 16:50:03 2019 by
2139
+ Generated on Fri Mar 8 17:26:54 2019 by
2018
2140
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
2019
2141
  0.9.18 (ruby-2.5.3).
2020
2142
  </div>
data/doc/_index.html CHANGED
@@ -112,7 +112,7 @@
112
112
  </div>
113
113
 
114
114
  <div id="footer">
115
- Generated on Tue Feb 26 16:50:02 2019 by
115
+ Generated on Fri Mar 8 17:26:54 2019 by
116
116
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
117
117
  0.9.18 (ruby-2.5.3).
118
118
  </div>
data/doc/file.README.html CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
81
81
  </div></div>
82
82
 
83
83
  <div id="footer">
84
- Generated on Tue Feb 26 16:50:02 2019 by
84
+ Generated on Fri Mar 8 17:26:54 2019 by
85
85
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
86
86
  0.9.18 (ruby-2.5.3).
87
87
  </div>
data/doc/index.html CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
81
81
  </div></div>
82
82
 
83
83
  <div id="footer">
84
- Generated on Tue Feb 26 16:50:02 2019 by
84
+ Generated on Fri Mar 8 17:26:54 2019 by
85
85
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
86
86
  0.9.18 (ruby-2.5.3).
87
87
  </div>
@@ -100,7 +100,7 @@
100
100
  </div>
101
101
 
102
102
  <div id="footer">
103
- Generated on Tue Feb 26 16:50:02 2019 by
103
+ Generated on Fri Mar 8 17:26:54 2019 by
104
104
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
105
  0.9.18 (ruby-2.5.3).
106
106
  </div>
data/lib/ae_easy/text.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'cgi'
2
2
  require 'json'
3
3
  require 'digest/sha1'
4
- require 'ae_easy-core'
4
+ require 'ae_easy/core'
5
5
  require 'ae_easy/text/version'
6
6
 
7
7
  module AeEasy
@@ -34,21 +34,23 @@ module AeEasy
34
34
  CGI.unescapeHTML text
35
35
  end
36
36
 
37
- # Strip a value.
37
+ # Strip a value by trimming spaces, reducing sequential spaces into a
38
+ # single space, decode HTML entities and change encoding to UTF-8.
38
39
  #
39
40
  # @param [String,Object,nil] raw_text Text to strip.
41
+ # @param [String] orig_encoding Text original encoding.
40
42
  #
41
43
  # @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
42
- def self.strip raw_text
44
+ def self.strip raw_text, orig_encoding = 'ASCII'
43
45
  return nil if raw_text.nil?
44
46
  raw_text = raw_text.to_s unless raw_text.is_a? String
45
47
  regex = /(\s|\u3000|\u00a0)+/
46
48
  good_encoding = (raw_text =~ /\u3000/ || true) rescue false
47
49
  unless good_encoding
48
- raw_text = raw_text.force_encoding($APP_CONFIG[:encoding]).encode('UTF-8')
50
+ raw_text = raw_text.force_encoding(orig_encoding).encode('UTF-8', invalid: :replace, undef: :replace)
49
51
  regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
50
52
  end
51
- text = raw_text&.gsub(regex, ' ')&.strip
53
+ text = raw_text.gsub(regex, ' ').strip
52
54
  text.nil? ? nil : decode_html(text)
53
55
  end
54
56
 
@@ -58,8 +60,9 @@ module AeEasy
58
60
  # @param [Hash] data Data hash to save parsed data into.
59
61
  # @param [String,Symbol] key Header column key being parsed.
60
62
  def self.default_parser cell_element, data, key
61
- cell_element&.search('//i').remove
62
- row_data[key] = strip cell_element&.text
63
+ return if cell_element.nil?
64
+ cell_element.search('//i').remove if cell_element.search('//i').count > 0
65
+ data[key] = strip cell_element.text
63
66
  end
64
67
 
65
68
  # Parse row data matching a selector using a header map to translate
@@ -74,6 +77,8 @@ module AeEasy
74
77
  # index dictionary.
75
78
  # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
76
79
  # Custom column parsers for advance data extraction.
80
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
81
+ # retrieving content cells and rows.
77
82
  #
78
83
  # @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
79
84
  # @yieldparam [Array] row Raw row data.
@@ -87,7 +92,8 @@ module AeEasy
87
92
  selector: nil,
88
93
  first_row_header: false,
89
94
  header_map: {},
90
- column_parsers: {}
95
+ column_parsers: {},
96
+ ignore_text_nodes: true
91
97
  }.merge opts
92
98
 
93
99
  # Setup config
@@ -96,10 +102,13 @@ module AeEasy
96
102
  first = first_row_header = opts[:first_row_header]
97
103
  header_map = opts[:header_map]
98
104
  column_parsers = opts[:column_parsers]
105
+ ignore_text_nodes = opts[:ignore_text_nodes]
99
106
 
100
107
  # Get and parse rows
101
108
  html_rows = opts[:html].css(opts[:selector])
102
109
  html_rows.each do |row|
110
+ next if ignore_text_nodes && row.name == 'text'
111
+
103
112
  # First row header validation
104
113
  if first && first_row_header
105
114
  first = false
@@ -110,7 +119,9 @@ module AeEasy
110
119
  row_data = {}
111
120
  header_map.each do |key, index|
112
121
  # Parse column html with default or custom parser
113
- child_element = row.children[index]
122
+ children = row.children
123
+ children = children.select{|i|i.name != 'text'} if ignore_text_nodes
124
+ child_element = children[index]
114
125
  column_parsers[key].nil? ?
115
126
  default_parser(child_element, row_data, key) :
116
127
  column_parsers[key].call(child_element, row_data, key)
@@ -129,12 +140,13 @@ module AeEasy
129
140
  #
130
141
  # @return [Symbol,String] Translated key.
131
142
  def self.translate_label_to_key element, label_map
132
- element&.search('//i').remove
133
- text = strip element&.text
134
- key = label_map.find do |k,v|
143
+ return nil if element.nil?
144
+ element.search('//i').remove if element.search('//i').count > 0
145
+ text = strip element.text
146
+ key_pair = label_map.find do |k,v|
135
147
  v.is_a?(Regexp) ? (text =~ v) : (text == v)
136
- end&.first
137
- key
148
+ end
149
+ key = key_pair.nil? ? nil : key_pair[0]
138
150
  end
139
151
 
140
152
  # Parse header from selector and create a header map to match a column key
@@ -147,6 +159,8 @@ module AeEasy
147
159
  # Key vs. label dictionary.
148
160
  # @option opts [Boolean] :first_row_header (false) If true then selector
149
161
  # first matching row will be used as header for parsing.
162
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
163
+ # retrieving header cells and rows.
150
164
  #
151
165
  # @return [Hash{Symbol,String => Integer},nil] Key vs. column index map.
152
166
  def self.parse_header_map opts = {}
@@ -154,11 +168,13 @@ module AeEasy
154
168
  html: nil,
155
169
  selector: nil,
156
170
  column_key_label_map: {},
157
- first_row_header: false
171
+ first_row_header: false,
172
+ ignore_text_nodes: true
158
173
  }.merge opts
159
174
 
160
175
  # Setup config
161
176
  dictionary = opts[:column_key_label_map]
177
+ ignore_text_nodes = opts[:ignore_text_nodes]
162
178
  data = []
163
179
  column_map = nil
164
180
 
@@ -167,8 +183,12 @@ module AeEasy
167
183
  return nil if html_rows.nil?
168
184
  html_rows = [html_rows.first] if opts[:first_row_header]
169
185
  html_rows.each do |row|
186
+ next if ignore_text_nodes && row.name == 'text'
187
+
170
188
  column_map = {}
171
- row.children.each_with_index do |col, index|
189
+ children = row.children
190
+ children = children.select{|i|i.name != 'text'} if ignore_text_nodes
191
+ children.each_with_index do |col, index|
172
192
  # Parse and map column header
173
193
  column_key = translate_label_to_key col, dictionary
174
194
  next if column_key.nil?
@@ -192,6 +212,8 @@ module AeEasy
192
212
  # first matching row will be used as header for parsing.
193
213
  # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
194
214
  # Custom column parsers for advance data extraction.
215
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
216
+ # retrieving cells and rows.
195
217
  #
196
218
  # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
197
219
  # @yieldparam [Array] row Raw content row data.
@@ -208,19 +230,22 @@ module AeEasy
208
230
  header_key_label_map: {},
209
231
  content_selector: nil,
210
232
  first_row_header: false,
211
- column_parsers: {}
233
+ column_parsers: {},
234
+ ignore_text_nodes: true
212
235
  }.merge opts
213
236
  return nil if opts[:html].nil?
214
237
  header_map = self.parse_header_map html: opts[:html],
215
238
  selector: opts[:header_selector],
216
239
  column_key_label_map: opts[:header_key_label_map],
217
- first_row_header: opts[:first_row_header]
240
+ first_row_header: opts[:first_row_header],
241
+ ignore_text_nodes: opts[:ignore_text_nodes]
218
242
  return nil if header_map.nil?
219
243
  data = self.parse_content html: opts[:html],
220
244
  selector: opts[:content_selector],
221
245
  header_map: header_map,
222
246
  first_row_header: opts[:first_row_header],
223
247
  column_parsers: opts[:column_parsers],
248
+ ignore_text_nodes: opts[:ignore_text_nodes],
224
249
  &filter
225
250
  {header_map: header_map, data: data}
226
251
  end
@@ -237,6 +262,8 @@ module AeEasy
237
262
  # @option opts [String] :content_selector Content row elements selector.
238
263
  # @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
239
264
  # Custom column parsers for advance data extraction.
265
+ # @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
266
+ # retrieving cells and rows.
240
267
  #
241
268
  # @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
242
269
  # @yieldparam [Array] row Raw content row data.
@@ -253,7 +280,8 @@ module AeEasy
253
280
  header_selector: nil,
254
281
  header_key_label_map: {},
255
282
  content_selector: nil,
256
- column_parsers: {}
283
+ column_parsers: {},
284
+ ignore_text_nodes: true
257
285
  }.merge opts
258
286
  return nil if opts[:html].nil?
259
287
 
@@ -1,6 +1,6 @@
1
1
  module AeEasy
2
2
  module Text
3
3
  # Gem version
4
- VERSION = "0.0.1"
4
+ VERSION = "0.0.2"
5
5
  end
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ae_easy-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo Rosales
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-26 00:00:00.000000000 Z
11
+ date: 2019-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ae_easy-core
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: 0.1.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: 0.1.2
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement