ae_easy-text 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ae_easy-text.gemspec +1 -1
- data/doc/AeEasy.html +1 -1
- data/doc/AeEasy/Text.html +238 -116
- data/doc/_index.html +1 -1
- data/doc/file.README.html +1 -1
- data/doc/index.html +1 -1
- data/doc/top-level-namespace.html +1 -1
- data/lib/ae_easy/text.rb +47 -19
- data/lib/ae_easy/text/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10296214e4de01abc2d77f5a5549c9e4c883009a86915908283bb71dad3bec0b
|
4
|
+
data.tar.gz: 9db77d0892191a3dd5c170ffe2257c14559f0721075c88c1fc75ce6fa0f0b04e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6582399f34051ebcc5fa5192c22fc984329a04b1ffefb8c2db49ac5e782978b49c3fc4ea5c7fe6c0f0bbdffb4fff1799adb03f79ca92358b13409691e10b6b0d
|
7
|
+
data.tar.gz: 174c5533dd32772393fb7a8ba632fb4de8fe5ce8eba6e35d753abc906b295c1659d6cde0cf18e7496b6469a430e7f922335e23f7c3d5d724eba019163a6801f3
|
data/ae_easy-text.gemspec
CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.require_paths = ["lib"]
|
39
39
|
spec.required_ruby_version = '>= 2.2.2'
|
40
40
|
|
41
|
-
spec.add_dependency 'ae_easy-core', '>= 0'
|
41
|
+
spec.add_dependency 'ae_easy-core', '>= 0.1.2'
|
42
42
|
spec.add_development_dependency 'bundler', '>= 1.16.3'
|
43
43
|
spec.add_development_dependency 'rake', '>= 10.0'
|
44
44
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/doc/AeEasy.html
CHANGED
@@ -107,7 +107,7 @@
|
|
107
107
|
</div>
|
108
108
|
|
109
109
|
<div id="footer">
|
110
|
-
Generated on
|
110
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
111
111
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
112
112
|
0.9.18 (ruby-2.5.3).
|
113
113
|
</div>
|
data/doc/AeEasy/Text.html
CHANGED
@@ -108,7 +108,7 @@
|
|
108
108
|
|
109
109
|
</div>
|
110
110
|
</dt>
|
111
|
-
<dd><pre class="code"><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>0.0.
|
111
|
+
<dd><pre class="code"><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>0.0.2</span><span class='tstring_end'>"</span></span></pre></dd>
|
112
112
|
|
113
113
|
</dl>
|
114
114
|
|
@@ -326,7 +326,7 @@ using a header map to match columns.</p>
|
|
326
326
|
<li class="public ">
|
327
327
|
<span class="summary_signature">
|
328
328
|
|
329
|
-
<a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text) ⇒ String<sup>?</sup> </a>
|
329
|
+
<a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text, orig_encoding = 'ASCII') ⇒ String<sup>?</sup> </a>
|
330
330
|
|
331
331
|
|
332
332
|
|
@@ -341,7 +341,8 @@ using a header map to match columns.</p>
|
|
341
341
|
|
342
342
|
|
343
343
|
<span class="summary_desc"><div class='inline'>
|
344
|
-
<p>Strip a value
|
344
|
+
<p>Strip a value by trimming spaces, reducing secuential spaces into a
|
345
|
+
single space, decode HTML entities and change encoding to UTF-8.</p>
|
345
346
|
</div></span>
|
346
347
|
|
347
348
|
</li>
|
@@ -532,17 +533,19 @@ using a header map to match columns.</p>
|
|
532
533
|
<pre class="lines">
|
533
534
|
|
534
535
|
|
535
|
-
60
|
536
|
-
61
|
537
536
|
62
|
538
|
-
63
|
537
|
+
63
|
538
|
+
64
|
539
|
+
65
|
540
|
+
66</pre>
|
539
541
|
</td>
|
540
542
|
<td>
|
541
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
543
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 62</span>
|
542
544
|
|
543
545
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_default_parser'>default_parser</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='comma'>,</span> <span class='id identifier rubyid_data'>data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span>
|
544
|
-
<span class='
|
545
|
-
<span class='id identifier
|
546
|
+
<span class='kw'>return</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
547
|
+
<span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>></span> <span class='int'>0</span>
|
548
|
+
<span class='id identifier rubyid_data'>data</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
546
549
|
<span class='kw'>end</span></pre>
|
547
550
|
</td>
|
548
551
|
</tr>
|
@@ -815,6 +818,22 @@ ignored.</p>
|
|
815
818
|
|
816
819
|
— <div class='inline'>
|
817
820
|
<p>Custom column parsers for advance data extraction.</p>
|
821
|
+
</div>
|
822
|
+
|
823
|
+
</li>
|
824
|
+
|
825
|
+
<li>
|
826
|
+
<span class="name">:ignore_text_nodes</span>
|
827
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
828
|
+
<span class="default">
|
829
|
+
|
830
|
+
— default:
|
831
|
+
<tt>true</tt>
|
832
|
+
|
833
|
+
</span>
|
834
|
+
|
835
|
+
— <div class='inline'>
|
836
|
+
<p>Ignore text nodes when retriving content cells and rows.</p>
|
818
837
|
</div>
|
819
838
|
|
820
839
|
</li>
|
@@ -917,11 +936,6 @@ ignored.</p>
|
|
917
936
|
<pre class="lines">
|
918
937
|
|
919
938
|
|
920
|
-
84
|
921
|
-
85
|
922
|
-
86
|
923
|
-
87
|
924
|
-
88
|
925
939
|
89
|
926
940
|
90
|
927
941
|
91
|
@@ -955,10 +969,21 @@ ignored.</p>
|
|
955
969
|
119
|
956
970
|
120
|
957
971
|
121
|
958
|
-
122
|
972
|
+
122
|
973
|
+
123
|
974
|
+
124
|
975
|
+
125
|
976
|
+
126
|
977
|
+
127
|
978
|
+
128
|
979
|
+
129
|
980
|
+
130
|
981
|
+
131
|
982
|
+
132
|
983
|
+
133</pre>
|
959
984
|
</td>
|
960
985
|
<td>
|
961
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
986
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 89</span>
|
962
987
|
|
963
988
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='id identifier rubyid_opts'>opts</span><span class='comma'>,</span> <span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
964
989
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
@@ -966,7 +991,8 @@ ignored.</p>
|
|
966
991
|
<span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
967
992
|
<span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
|
968
993
|
<span class='label'>header_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
969
|
-
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
994
|
+
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
995
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
970
996
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
971
997
|
|
972
998
|
<span class='comment'># Setup config
|
@@ -975,10 +1001,13 @@ ignored.</p>
|
|
975
1001
|
<span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
|
976
1002
|
<span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_map</span><span class='rbracket'>]</span>
|
977
1003
|
<span class='id identifier rubyid_column_parsers'>column_parsers</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span>
|
1004
|
+
<span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
|
978
1005
|
|
979
1006
|
<span class='comment'># Get and parse rows
|
980
1007
|
</span> <span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_css'>css</span><span class='lparen'>(</span><span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:selector</span><span class='rbracket'>]</span><span class='rparen'>)</span>
|
981
1008
|
<span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
|
1009
|
+
<span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&&</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span>
|
1010
|
+
|
982
1011
|
<span class='comment'># First row header validation
|
983
1012
|
</span> <span class='kw'>if</span> <span class='id identifier rubyid_first'>first</span> <span class='op'>&&</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span>
|
984
1013
|
<span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='kw'>false</span>
|
@@ -989,7 +1018,9 @@ ignored.</p>
|
|
989
1018
|
</span> <span class='id identifier rubyid_row_data'>row_data</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
990
1019
|
<span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_key'>key</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
|
991
1020
|
<span class='comment'># Parse column html with default or custom parser
|
992
|
-
</span> <span class='id identifier
|
1021
|
+
</span> <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
|
1022
|
+
<span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
|
1023
|
+
<span class='id identifier rubyid_child_element'>child_element</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='lbracket'>[</span><span class='id identifier rubyid_index'>index</span><span class='rbracket'>]</span>
|
993
1024
|
<span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span>
|
994
1025
|
<span class='id identifier rubyid_default_parser'>default_parser</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span> <span class='op'>:</span>
|
995
1026
|
<span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_call'>call</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span>
|
@@ -1106,6 +1137,22 @@ ignored.</p>
|
|
1106
1137
|
— <div class='inline'>
|
1107
1138
|
<p>If true then selector first matching row will be used as header for
|
1108
1139
|
parsing.</p>
|
1140
|
+
</div>
|
1141
|
+
|
1142
|
+
</li>
|
1143
|
+
|
1144
|
+
<li>
|
1145
|
+
<span class="name">:ignore_text_nodes</span>
|
1146
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
1147
|
+
<span class="default">
|
1148
|
+
|
1149
|
+
— default:
|
1150
|
+
<tt>true</tt>
|
1151
|
+
|
1152
|
+
</span>
|
1153
|
+
|
1154
|
+
— <div class='inline'>
|
1155
|
+
<p>Ignore text nodes when retriving header cells and rows.</p>
|
1109
1156
|
</div>
|
1110
1157
|
|
1111
1158
|
</li>
|
@@ -1138,20 +1185,6 @@ parsing.</p>
|
|
1138
1185
|
<pre class="lines">
|
1139
1186
|
|
1140
1187
|
|
1141
|
-
152
|
1142
|
-
153
|
1143
|
-
154
|
1144
|
-
155
|
1145
|
-
156
|
1146
|
-
157
|
1147
|
-
158
|
1148
|
-
159
|
1149
|
-
160
|
1150
|
-
161
|
1151
|
-
162
|
1152
|
-
163
|
1153
|
-
164
|
1154
|
-
165
|
1155
1188
|
166
|
1156
1189
|
167
|
1157
1190
|
168
|
@@ -1166,21 +1199,43 @@ parsing.</p>
|
|
1166
1199
|
177
|
1167
1200
|
178
|
1168
1201
|
179
|
1169
|
-
180
|
1202
|
+
180
|
1203
|
+
181
|
1204
|
+
182
|
1205
|
+
183
|
1206
|
+
184
|
1207
|
+
185
|
1208
|
+
186
|
1209
|
+
187
|
1210
|
+
188
|
1211
|
+
189
|
1212
|
+
190
|
1213
|
+
191
|
1214
|
+
192
|
1215
|
+
193
|
1216
|
+
194
|
1217
|
+
195
|
1218
|
+
196
|
1219
|
+
197
|
1220
|
+
198
|
1221
|
+
199
|
1222
|
+
200</pre>
|
1170
1223
|
</td>
|
1171
1224
|
<td>
|
1172
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
1225
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 166</span>
|
1173
1226
|
|
1174
1227
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1175
1228
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
1176
1229
|
<span class='label'>html:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1177
1230
|
<span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1178
1231
|
<span class='label'>column_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1179
|
-
<span class='label'>first_row_header:</span> <span class='kw'>false</span>
|
1232
|
+
<span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
|
1233
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
1180
1234
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
1181
1235
|
|
1182
1236
|
<span class='comment'># Setup config
|
1183
1237
|
</span> <span class='id identifier rubyid_dictionary'>dictionary</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_key_label_map</span><span class='rbracket'>]</span>
|
1238
|
+
<span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
|
1184
1239
|
<span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
|
1185
1240
|
<span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
1186
1241
|
|
@@ -1189,8 +1244,12 @@ parsing.</p>
|
|
1189
1244
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1190
1245
|
<span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_first'>first</span><span class='rbracket'>]</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
|
1191
1246
|
<span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
|
1247
|
+
<span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&&</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span>
|
1248
|
+
|
1192
1249
|
<span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1193
|
-
<span class='id identifier
|
1250
|
+
<span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
|
1251
|
+
<span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
|
1252
|
+
<span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_each_with_index'>each_with_index</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
|
1194
1253
|
<span class='comment'># Parse and map column header
|
1195
1254
|
</span> <span class='id identifier rubyid_column_key'>column_key</span> <span class='op'>=</span> <span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_dictionary'>dictionary</span>
|
1196
1255
|
<span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_column_key'>column_key</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
@@ -1336,6 +1395,22 @@ parsing.</p>
|
|
1336
1395
|
|
1337
1396
|
— <div class='inline'>
|
1338
1397
|
<p>Custom column parsers for advance data extraction.</p>
|
1398
|
+
</div>
|
1399
|
+
|
1400
|
+
</li>
|
1401
|
+
|
1402
|
+
<li>
|
1403
|
+
<span class="name">:ignore_text_nodes</span>
|
1404
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
1405
|
+
<span class="default">
|
1406
|
+
|
1407
|
+
— default:
|
1408
|
+
<tt>true</tt>
|
1409
|
+
|
1410
|
+
</span>
|
1411
|
+
|
1412
|
+
— <div class='inline'>
|
1413
|
+
<p>Ignore text nodes when retriving cells and rows.</p>
|
1339
1414
|
</div>
|
1340
1415
|
|
1341
1416
|
</li>
|
@@ -1443,32 +1518,35 @@ parsing.</p>
|
|
1443
1518
|
<pre class="lines">
|
1444
1519
|
|
1445
1520
|
|
1446
|
-
|
1447
|
-
|
1448
|
-
|
1449
|
-
|
1450
|
-
|
1451
|
-
|
1452
|
-
|
1453
|
-
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
|
1461
|
-
|
1462
|
-
|
1463
|
-
|
1464
|
-
|
1465
|
-
|
1466
|
-
|
1467
|
-
|
1468
|
-
|
1521
|
+
226
|
1522
|
+
227
|
1523
|
+
228
|
1524
|
+
229
|
1525
|
+
230
|
1526
|
+
231
|
1527
|
+
232
|
1528
|
+
233
|
1529
|
+
234
|
1530
|
+
235
|
1531
|
+
236
|
1532
|
+
237
|
1533
|
+
238
|
1534
|
+
239
|
1535
|
+
240
|
1536
|
+
241
|
1537
|
+
242
|
1538
|
+
243
|
1539
|
+
244
|
1540
|
+
245
|
1541
|
+
246
|
1542
|
+
247
|
1543
|
+
248
|
1544
|
+
249
|
1545
|
+
250
|
1546
|
+
251</pre>
|
1469
1547
|
</td>
|
1470
1548
|
<td>
|
1471
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
1549
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 226</span>
|
1472
1550
|
|
1473
1551
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_table'>parse_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
1474
1552
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
@@ -1477,19 +1555,22 @@ parsing.</p>
|
|
1477
1555
|
<span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1478
1556
|
<span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1479
1557
|
<span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
|
1480
|
-
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1558
|
+
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1559
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
1481
1560
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
1482
1561
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1483
1562
|
<span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1484
1563
|
<span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1485
1564
|
<span class='label'>column_key_label_map:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_key_label_map</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1486
|
-
<span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
|
1565
|
+
<span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1566
|
+
<span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
|
1487
1567
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1488
1568
|
<span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1489
1569
|
<span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:content_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1490
1570
|
<span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span>
|
1491
1571
|
<span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1492
1572
|
<span class='label'>column_parsers:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1573
|
+
<span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1493
1574
|
<span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
1494
1575
|
<span class='lbrace'>{</span><span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span> <span class='label'>data:</span> <span class='id identifier rubyid_data'>data</span><span class='rbrace'>}</span>
|
1495
1576
|
<span class='kw'>end</span></pre>
|
@@ -1625,6 +1706,22 @@ parsing.</p>
|
|
1625
1706
|
|
1626
1707
|
— <div class='inline'>
|
1627
1708
|
<p>Custom column parsers for advance data extraction.</p>
|
1709
|
+
</div>
|
1710
|
+
|
1711
|
+
</li>
|
1712
|
+
|
1713
|
+
<li>
|
1714
|
+
<span class="name">:ignore_text_nodes</span>
|
1715
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
1716
|
+
<span class="default">
|
1717
|
+
|
1718
|
+
— default:
|
1719
|
+
<tt>true</tt>
|
1720
|
+
|
1721
|
+
</span>
|
1722
|
+
|
1723
|
+
— <div class='inline'>
|
1724
|
+
<p>Ignore text nodes when retriving cells and rows.</p>
|
1628
1725
|
</div>
|
1629
1726
|
|
1630
1727
|
</li>
|
@@ -1732,42 +1829,43 @@ parsing.</p>
|
|
1732
1829
|
<pre class="lines">
|
1733
1830
|
|
1734
1831
|
|
1735
|
-
249
|
1736
|
-
250
|
1737
|
-
251
|
1738
|
-
252
|
1739
|
-
253
|
1740
|
-
254
|
1741
|
-
255
|
1742
|
-
256
|
1743
|
-
257
|
1744
|
-
258
|
1745
|
-
259
|
1746
|
-
260
|
1747
|
-
261
|
1748
|
-
262
|
1749
|
-
263
|
1750
|
-
264
|
1751
|
-
265
|
1752
|
-
266
|
1753
|
-
267
|
1754
|
-
268
|
1755
|
-
269
|
1756
|
-
270
|
1757
|
-
271
|
1758
|
-
272
|
1759
|
-
273
|
1760
|
-
274
|
1761
|
-
275
|
1762
1832
|
276
|
1763
1833
|
277
|
1764
1834
|
278
|
1765
1835
|
279
|
1766
1836
|
280
|
1767
|
-
281
|
1837
|
+
281
|
1838
|
+
282
|
1839
|
+
283
|
1840
|
+
284
|
1841
|
+
285
|
1842
|
+
286
|
1843
|
+
287
|
1844
|
+
288
|
1845
|
+
289
|
1846
|
+
290
|
1847
|
+
291
|
1848
|
+
292
|
1849
|
+
293
|
1850
|
+
294
|
1851
|
+
295
|
1852
|
+
296
|
1853
|
+
297
|
1854
|
+
298
|
1855
|
+
299
|
1856
|
+
300
|
1857
|
+
301
|
1858
|
+
302
|
1859
|
+
303
|
1860
|
+
304
|
1861
|
+
305
|
1862
|
+
306
|
1863
|
+
307
|
1864
|
+
308
|
1865
|
+
309</pre>
|
1768
1866
|
</td>
|
1769
1867
|
<td>
|
1770
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
1868
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 276</span>
|
1771
1869
|
|
1772
1870
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_vertical_table'>parse_vertical_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
1773
1871
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
@@ -1776,7 +1874,8 @@ parsing.</p>
|
|
1776
1874
|
<span class='label'>header_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1777
1875
|
<span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1778
1876
|
<span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1779
|
-
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1877
|
+
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1878
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
1780
1879
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
1781
1880
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1782
1881
|
|
@@ -1810,7 +1909,7 @@ parsing.</p>
|
|
1810
1909
|
<div class="method_details ">
|
1811
1910
|
<h3 class="signature " id="strip-class_method">
|
1812
1911
|
|
1813
|
-
.<strong>strip</strong>(raw_text) ⇒ <tt>String</tt><sup>?</sup>
|
1912
|
+
.<strong>strip</strong>(raw_text, orig_encoding = 'ASCII') ⇒ <tt>String</tt><sup>?</sup>
|
1814
1913
|
|
1815
1914
|
|
1816
1915
|
|
@@ -1819,7 +1918,10 @@ parsing.</p>
|
|
1819
1918
|
</h3><div class="docstring">
|
1820
1919
|
<div class="discussion">
|
1821
1920
|
|
1822
|
-
<p>Strip a value
|
1921
|
+
<p>Strip a value by trimming spaces, reducing sequential spaces into a</p>
|
1922
|
+
|
1923
|
+
<pre class="code ruby"><code class="ruby">single space, decode HTML entities and change encoding to UTF-8.
|
1924
|
+
</code></pre>
|
1823
1925
|
|
1824
1926
|
|
1825
1927
|
</div>
|
@@ -1840,6 +1942,24 @@ parsing.</p>
|
|
1840
1942
|
—
|
1841
1943
|
<div class='inline'>
|
1842
1944
|
<p>Text to strip.</p>
|
1945
|
+
</div>
|
1946
|
+
|
1947
|
+
</li>
|
1948
|
+
|
1949
|
+
<li>
|
1950
|
+
|
1951
|
+
<span class='name'>orig_encoding</span>
|
1952
|
+
|
1953
|
+
|
1954
|
+
<span class='type'>(<tt>String</tt>)</span>
|
1955
|
+
|
1956
|
+
|
1957
|
+
<em class="default">(defaults to: <tt>'ASCII'</tt>)</em>
|
1958
|
+
|
1959
|
+
|
1960
|
+
—
|
1961
|
+
<div class='inline'>
|
1962
|
+
<p>Text original encoding.</p>
|
1843
1963
|
</div>
|
1844
1964
|
|
1845
1965
|
</li>
|
@@ -1871,8 +1991,6 @@ parsing.</p>
|
|
1871
1991
|
<pre class="lines">
|
1872
1992
|
|
1873
1993
|
|
1874
|
-
42
|
1875
|
-
43
|
1876
1994
|
44
|
1877
1995
|
45
|
1878
1996
|
46
|
@@ -1882,21 +2000,23 @@ parsing.</p>
|
|
1882
2000
|
50
|
1883
2001
|
51
|
1884
2002
|
52
|
1885
|
-
53
|
2003
|
+
53
|
2004
|
+
54
|
2005
|
+
55</pre>
|
1886
2006
|
</td>
|
1887
2007
|
<td>
|
1888
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
2008
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 44</span>
|
1889
2009
|
|
1890
|
-
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span>
|
2010
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='comma'>,</span> <span class='id identifier rubyid_orig_encoding'>orig_encoding</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>ASCII</span><span class='tstring_end'>'</span></span>
|
1891
2011
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1892
2012
|
<span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span> <span class='kw'>unless</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span> <span class='const'>String</span>
|
1893
2013
|
<span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0)+</span><span class='regexp_end'>/</span></span>
|
1894
2014
|
<span class='id identifier rubyid_good_encoding'>good_encoding</span> <span class='op'>=</span> <span class='lparen'>(</span><span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=~</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\u3000</span><span class='regexp_end'>/</span></span> <span class='op'>||</span> <span class='kw'>true</span><span class='rparen'>)</span> <span class='kw'>rescue</span> <span class='kw'>false</span>
|
1895
2015
|
<span class='kw'>unless</span> <span class='id identifier rubyid_good_encoding'>good_encoding</span>
|
1896
|
-
<span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='
|
2016
|
+
<span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='id identifier rubyid_orig_encoding'>orig_encoding</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_encode'>encode</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>UTF-8</span><span class='tstring_end'>'</span></span><span class='comma'>,</span> <span class='label'>invalid:</span> <span class='symbol'>:replace</span><span class='comma'>,</span> <span class='label'>undef:</span> <span class='symbol'>:replace</span><span class='rparen'>)</span>
|
1897
2017
|
<span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0|\u00c2\u00a0)+</span><span class='regexp_end'>/</span></span>
|
1898
2018
|
<span class='kw'>end</span>
|
1899
|
-
<span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='
|
2019
|
+
<span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='id identifier rubyid_regex'>regex</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'> </span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span>
|
1900
2020
|
<span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_decode_html'>decode_html</span><span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span><span class='rparen'>)</span>
|
1901
2021
|
<span class='kw'>end</span></pre>
|
1902
2022
|
</td>
|
@@ -1984,25 +2104,27 @@ parsing.</p>
|
|
1984
2104
|
<pre class="lines">
|
1985
2105
|
|
1986
2106
|
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
2107
|
+
142
|
2108
|
+
143
|
2109
|
+
144
|
2110
|
+
145
|
2111
|
+
146
|
2112
|
+
147
|
2113
|
+
148
|
2114
|
+
149
|
2115
|
+
150</pre>
|
1995
2116
|
</td>
|
1996
2117
|
<td>
|
1997
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
2118
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 142</span>
|
1998
2119
|
|
1999
2120
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_element'>element</span><span class='comma'>,</span> <span class='id identifier rubyid_label_map'>label_map</span>
|
2000
|
-
<span class='
|
2001
|
-
<span class='id identifier
|
2002
|
-
<span class='id identifier
|
2121
|
+
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
2122
|
+
<span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>></span> <span class='int'>0</span>
|
2123
|
+
<span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
2124
|
+
<span class='id identifier rubyid_key_pair'>key_pair</span> <span class='op'>=</span> <span class='id identifier rubyid_label_map'>label_map</span><span class='period'>.</span><span class='id identifier rubyid_find'>find</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_k'>k</span><span class='comma'>,</span><span class='id identifier rubyid_v'>v</span><span class='op'>|</span>
|
2003
2125
|
<span class='id identifier rubyid_v'>v</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span><span class='lparen'>(</span><span class='const'>Regexp</span><span class='rparen'>)</span> <span class='op'>?</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>=~</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span> <span class='op'>:</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>==</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span>
|
2004
|
-
<span class='kw'>end</span
|
2005
|
-
<span class='id identifier rubyid_key'>key</span>
|
2126
|
+
<span class='kw'>end</span>
|
2127
|
+
<span class='id identifier rubyid_key'>key</span> <span class='op'>=</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
|
2006
2128
|
<span class='kw'>end</span></pre>
|
2007
2129
|
</td>
|
2008
2130
|
</tr>
|
@@ -2014,7 +2136,7 @@ parsing.</p>
|
|
2014
2136
|
</div>
|
2015
2137
|
|
2016
2138
|
<div id="footer">
|
2017
|
-
Generated on
|
2139
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
2018
2140
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
2019
2141
|
0.9.18 (ruby-2.5.3).
|
2020
2142
|
</div>
|
data/doc/_index.html
CHANGED
@@ -112,7 +112,7 @@
|
|
112
112
|
</div>
|
113
113
|
|
114
114
|
<div id="footer">
|
115
|
-
Generated on
|
115
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
116
116
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
117
117
|
0.9.18 (ruby-2.5.3).
|
118
118
|
</div>
|
data/doc/file.README.html
CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
|
|
81
81
|
</div></div>
|
82
82
|
|
83
83
|
<div id="footer">
|
84
|
-
Generated on
|
84
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
85
85
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
86
86
|
0.9.18 (ruby-2.5.3).
|
87
87
|
</div>
|
data/doc/index.html
CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
|
|
81
81
|
</div></div>
|
82
82
|
|
83
83
|
<div id="footer">
|
84
|
-
Generated on
|
84
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
85
85
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
86
86
|
0.9.18 (ruby-2.5.3).
|
87
87
|
</div>
|
@@ -100,7 +100,7 @@
|
|
100
100
|
</div>
|
101
101
|
|
102
102
|
<div id="footer">
|
103
|
-
Generated on
|
103
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
104
104
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
105
105
|
0.9.18 (ruby-2.5.3).
|
106
106
|
</div>
|
data/lib/ae_easy/text.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'cgi'
|
2
2
|
require 'json'
|
3
3
|
require 'digest/sha1'
|
4
|
-
require 'ae_easy
|
4
|
+
require 'ae_easy/core'
|
5
5
|
require 'ae_easy/text/version'
|
6
6
|
|
7
7
|
module AeEasy
|
@@ -34,21 +34,23 @@ module AeEasy
|
|
34
34
|
CGI.unescapeHTML text
|
35
35
|
end
|
36
36
|
|
37
|
-
# Strip a value
|
37
|
+
# Strip a value by trimming spaces, reducing sequential spaces into a
|
38
|
+
# single space, decode HTML entities and change encoding to UTF-8.
|
38
39
|
#
|
39
40
|
# @param [String,Object,nil] raw_text Text to strip.
|
41
|
+
# @param [String] orig_encoding Text original encoding.
|
40
42
|
#
|
41
43
|
# @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
|
42
|
-
def self.strip raw_text
|
44
|
+
def self.strip raw_text, orig_encoding = 'ASCII'
|
43
45
|
return nil if raw_text.nil?
|
44
46
|
raw_text = raw_text.to_s unless raw_text.is_a? String
|
45
47
|
regex = /(\s|\u3000|\u00a0)+/
|
46
48
|
good_encoding = (raw_text =~ /\u3000/ || true) rescue false
|
47
49
|
unless good_encoding
|
48
|
-
raw_text = raw_text.force_encoding(
|
50
|
+
raw_text = raw_text.force_encoding(orig_encoding).encode('UTF-8', invalid: :replace, undef: :replace)
|
49
51
|
regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
|
50
52
|
end
|
51
|
-
text = raw_text
|
53
|
+
text = raw_text.gsub(regex, ' ').strip
|
52
54
|
text.nil? ? nil : decode_html(text)
|
53
55
|
end
|
54
56
|
|
@@ -58,8 +60,9 @@ module AeEasy
|
|
58
60
|
# @param [Hash] data Data hash to save parsed data into.
|
59
61
|
# @param [String,Symbol] key Header column key being parsed.
|
60
62
|
def self.default_parser cell_element, data, key
|
61
|
-
cell_element
|
62
|
-
|
63
|
+
return if cell_element.nil?
|
64
|
+
cell_element.search('//i').remove if cell_element.search('//i').count > 0
|
65
|
+
data[key] = strip cell_element.text
|
63
66
|
end
|
64
67
|
|
65
68
|
# Parse row data matching a selector using a header map to translate
|
@@ -74,6 +77,8 @@ module AeEasy
|
|
74
77
|
# index dictionary.
|
75
78
|
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
|
76
79
|
# Custom column parsers for advanced data extraction.
|
80
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
81
|
+
# retrieving content cells and rows.
|
77
82
|
#
|
78
83
|
# @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
|
79
84
|
# @yieldparam [Array] row Raw row data.
|
@@ -87,7 +92,8 @@ module AeEasy
|
|
87
92
|
selector: nil,
|
88
93
|
first_row_header: false,
|
89
94
|
header_map: {},
|
90
|
-
column_parsers: {}
|
95
|
+
column_parsers: {},
|
96
|
+
ignore_text_nodes: true
|
91
97
|
}.merge opts
|
92
98
|
|
93
99
|
# Setup config
|
@@ -96,10 +102,13 @@ module AeEasy
|
|
96
102
|
first = first_row_header = opts[:first_row_header]
|
97
103
|
header_map = opts[:header_map]
|
98
104
|
column_parsers = opts[:column_parsers]
|
105
|
+
ignore_text_nodes = opts[:ignore_text_nodes]
|
99
106
|
|
100
107
|
# Get and parse rows
|
101
108
|
html_rows = opts[:html].css(opts[:selector])
|
102
109
|
html_rows.each do |row|
|
110
|
+
next if ignore_text_nodes && row.name == 'text'
|
111
|
+
|
103
112
|
# First row header validation
|
104
113
|
if first && first_row_header
|
105
114
|
first = false
|
@@ -110,7 +119,9 @@ module AeEasy
|
|
110
119
|
row_data = {}
|
111
120
|
header_map.each do |key, index|
|
112
121
|
# Parse column html with default or custom parser
|
113
|
-
|
122
|
+
children = row.children
|
123
|
+
children = children.select{|i|i.name != 'text'} if ignore_text_nodes
|
124
|
+
child_element = children[index]
|
114
125
|
column_parsers[key].nil? ?
|
115
126
|
default_parser(child_element, row_data, key) :
|
116
127
|
column_parsers[key].call(child_element, row_data, key)
|
@@ -129,12 +140,13 @@ module AeEasy
|
|
129
140
|
#
|
130
141
|
# @return [Symbol,String] Translated key.
|
131
142
|
def self.translate_label_to_key element, label_map
|
132
|
-
element
|
133
|
-
|
134
|
-
|
143
|
+
return nil if element.nil?
|
144
|
+
element.search('//i').remove if element.search('//i').count > 0
|
145
|
+
text = strip element.text
|
146
|
+
key_pair = label_map.find do |k,v|
|
135
147
|
v.is_a?(Regexp) ? (text =~ v) : (text == v)
|
136
|
-
end
|
137
|
-
key
|
148
|
+
end
|
149
|
+
key = key_pair.nil? ? nil : key_pair[0]
|
138
150
|
end
|
139
151
|
|
140
152
|
# Parse header from selector and create a header map to match a column key
|
@@ -147,6 +159,8 @@ module AeEasy
|
|
147
159
|
# Key vs. label dictionary.
|
148
160
|
# @option opts [Boolean] :first_row_header (false) If true then selector
|
149
161
|
# first matching row will be used as header for parsing.
|
162
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
163
|
+
# retrieving header cells and rows.
|
150
164
|
#
|
151
165
|
# @return [Hash{Symbol,String => Integer},nil] Key vs. column index map.
|
152
166
|
def self.parse_header_map opts = {}
|
@@ -154,11 +168,13 @@ module AeEasy
|
|
154
168
|
html: nil,
|
155
169
|
selector: nil,
|
156
170
|
column_key_label_map: {},
|
157
|
-
first_row_header: false
|
171
|
+
first_row_header: false,
|
172
|
+
ignore_text_nodes: true
|
158
173
|
}.merge opts
|
159
174
|
|
160
175
|
# Setup config
|
161
176
|
dictionary = opts[:column_key_label_map]
|
177
|
+
ignore_text_nodes = opts[:ignore_text_nodes]
|
162
178
|
data = []
|
163
179
|
column_map = nil
|
164
180
|
|
@@ -167,8 +183,12 @@ module AeEasy
|
|
167
183
|
return nil if html_rows.nil?
|
168
184
|
html_rows = [html_rows.first] if opts[:first_row_header]
|
169
185
|
html_rows.each do |row|
|
186
|
+
next if ignore_text_nodes && row.name == 'text'
|
187
|
+
|
170
188
|
column_map = {}
|
171
|
-
row.children
|
189
|
+
children = row.children
|
190
|
+
children = children.select{|i|i.name != 'text'} if ignore_text_nodes
|
191
|
+
children.each_with_index do |col, index|
|
172
192
|
# Parse and map column header
|
173
193
|
column_key = translate_label_to_key col, dictionary
|
174
194
|
next if column_key.nil?
|
@@ -192,6 +212,8 @@ module AeEasy
|
|
192
212
|
# first matching row will be used as header for parsing.
|
193
213
|
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
|
194
214
|
# Custom column parsers for advanced data extraction.
|
215
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
216
|
+
# retrieving cells and rows.
|
195
217
|
#
|
196
218
|
# @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
|
197
219
|
# @yieldparam [Array] row Raw content row data.
|
@@ -208,19 +230,22 @@ module AeEasy
|
|
208
230
|
header_key_label_map: {},
|
209
231
|
content_selector: nil,
|
210
232
|
first_row_header: false,
|
211
|
-
column_parsers: {}
|
233
|
+
column_parsers: {},
|
234
|
+
ignore_text_nodes: true
|
212
235
|
}.merge opts
|
213
236
|
return nil if opts[:html].nil?
|
214
237
|
header_map = self.parse_header_map html: opts[:html],
|
215
238
|
selector: opts[:header_selector],
|
216
239
|
column_key_label_map: opts[:header_key_label_map],
|
217
|
-
first_row_header: opts[:first_row_header]
|
240
|
+
first_row_header: opts[:first_row_header],
|
241
|
+
ignore_text_nodes: opts[:ignore_text_nodes]
|
218
242
|
return nil if header_map.nil?
|
219
243
|
data = self.parse_content html: opts[:html],
|
220
244
|
selector: opts[:content_selector],
|
221
245
|
header_map: header_map,
|
222
246
|
first_row_header: opts[:first_row_header],
|
223
247
|
column_parsers: opts[:column_parsers],
|
248
|
+
ignore_text_nodes: opts[:ignore_text_nodes],
|
224
249
|
&filter
|
225
250
|
{header_map: header_map, data: data}
|
226
251
|
end
|
@@ -237,6 +262,8 @@ module AeEasy
|
|
237
262
|
# @option opts [String] :content_selector Content row elements selector.
|
238
263
|
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
|
239
264
|
# Custom column parsers for advanced data extraction.
|
265
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
266
|
+
# retrieving cells and rows.
|
240
267
|
#
|
241
268
|
# @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
|
242
269
|
# @yieldparam [Array] row Raw content row data.
|
@@ -253,7 +280,8 @@ module AeEasy
|
|
253
280
|
header_selector: nil,
|
254
281
|
header_key_label_map: {},
|
255
282
|
content_selector: nil,
|
256
|
-
column_parsers: {}
|
283
|
+
column_parsers: {},
|
284
|
+
ignore_text_nodes: true
|
257
285
|
}.merge opts
|
258
286
|
return nil if opts[:html].nil?
|
259
287
|
|
data/lib/ae_easy/text/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ae_easy-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo Rosales
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ae_easy-core
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.1.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.1.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|