ae_easy-text 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ae_easy-text.gemspec +1 -1
- data/doc/AeEasy.html +1 -1
- data/doc/AeEasy/Text.html +238 -116
- data/doc/_index.html +1 -1
- data/doc/file.README.html +1 -1
- data/doc/index.html +1 -1
- data/doc/top-level-namespace.html +1 -1
- data/lib/ae_easy/text.rb +47 -19
- data/lib/ae_easy/text/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10296214e4de01abc2d77f5a5549c9e4c883009a86915908283bb71dad3bec0b
|
4
|
+
data.tar.gz: 9db77d0892191a3dd5c170ffe2257c14559f0721075c88c1fc75ce6fa0f0b04e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6582399f34051ebcc5fa5192c22fc984329a04b1ffefb8c2db49ac5e782978b49c3fc4ea5c7fe6c0f0bbdffb4fff1799adb03f79ca92358b13409691e10b6b0d
|
7
|
+
data.tar.gz: 174c5533dd32772393fb7a8ba632fb4de8fe5ce8eba6e35d753abc906b295c1659d6cde0cf18e7496b6469a430e7f922335e23f7c3d5d724eba019163a6801f3
|
data/ae_easy-text.gemspec
CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.require_paths = ["lib"]
|
39
39
|
spec.required_ruby_version = '>= 2.2.2'
|
40
40
|
|
41
|
-
spec.add_dependency 'ae_easy-core', '>= 0'
|
41
|
+
spec.add_dependency 'ae_easy-core', '>= 0.1.2'
|
42
42
|
spec.add_development_dependency 'bundler', '>= 1.16.3'
|
43
43
|
spec.add_development_dependency 'rake', '>= 10.0'
|
44
44
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/doc/AeEasy.html
CHANGED
@@ -107,7 +107,7 @@
|
|
107
107
|
</div>
|
108
108
|
|
109
109
|
<div id="footer">
|
110
|
-
Generated on
|
110
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
111
111
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
112
112
|
0.9.18 (ruby-2.5.3).
|
113
113
|
</div>
|
data/doc/AeEasy/Text.html
CHANGED
@@ -108,7 +108,7 @@
|
|
108
108
|
|
109
109
|
</div>
|
110
110
|
</dt>
|
111
|
-
<dd><pre class="code"><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>0.0.
|
111
|
+
<dd><pre class="code"><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>0.0.2</span><span class='tstring_end'>"</span></span></pre></dd>
|
112
112
|
|
113
113
|
</dl>
|
114
114
|
|
@@ -326,7 +326,7 @@ using a header map to match columns.</p>
|
|
326
326
|
<li class="public ">
|
327
327
|
<span class="summary_signature">
|
328
328
|
|
329
|
-
<a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text) ⇒ String<sup>?</sup> </a>
|
329
|
+
<a href="#strip-class_method" title="strip (class method)">.<strong>strip</strong>(raw_text, orig_encoding = 'ASCII') ⇒ String<sup>?</sup> </a>
|
330
330
|
|
331
331
|
|
332
332
|
|
@@ -341,7 +341,8 @@ using a header map to match columns.</p>
|
|
341
341
|
|
342
342
|
|
343
343
|
<span class="summary_desc"><div class='inline'>
|
344
|
-
<p>Strip a value
|
344
|
+
<p>Strip a value by trimming spaces, reducing sequential spaces into a
|
345
|
+
single space, decoding HTML entities and changing the encoding to UTF-8.</p>
|
345
346
|
</div></span>
|
346
347
|
|
347
348
|
</li>
|
@@ -532,17 +533,19 @@ using a header map to match columns.</p>
|
|
532
533
|
<pre class="lines">
|
533
534
|
|
534
535
|
|
535
|
-
60
|
536
|
-
61
|
537
536
|
62
|
538
|
-
63
|
537
|
+
63
|
538
|
+
64
|
539
|
+
65
|
540
|
+
66</pre>
|
539
541
|
</td>
|
540
542
|
<td>
|
541
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
543
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 62</span>
|
542
544
|
|
543
545
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_default_parser'>default_parser</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='comma'>,</span> <span class='id identifier rubyid_data'>data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span>
|
544
|
-
<span class='
|
545
|
-
<span class='id identifier
|
546
|
+
<span class='kw'>return</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
547
|
+
<span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>></span> <span class='int'>0</span>
|
548
|
+
<span class='id identifier rubyid_data'>data</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_cell_element'>cell_element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
546
549
|
<span class='kw'>end</span></pre>
|
547
550
|
</td>
|
548
551
|
</tr>
|
@@ -815,6 +818,22 @@ ignored.</p>
|
|
815
818
|
|
816
819
|
— <div class='inline'>
|
817
820
|
<p>Custom column parsers for advanced data extraction.</p>
|
821
|
+
</div>
|
822
|
+
|
823
|
+
</li>
|
824
|
+
|
825
|
+
<li>
|
826
|
+
<span class="name">:ignore_text_nodes</span>
|
827
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
828
|
+
<span class="default">
|
829
|
+
|
830
|
+
— default:
|
831
|
+
<tt>true</tt>
|
832
|
+
|
833
|
+
</span>
|
834
|
+
|
835
|
+
— <div class='inline'>
|
836
|
+
<p>Ignore text nodes when retrieving content cells and rows.</p>
|
818
837
|
</div>
|
819
838
|
|
820
839
|
</li>
|
@@ -917,11 +936,6 @@ ignored.</p>
|
|
917
936
|
<pre class="lines">
|
918
937
|
|
919
938
|
|
920
|
-
84
|
921
|
-
85
|
922
|
-
86
|
923
|
-
87
|
924
|
-
88
|
925
939
|
89
|
926
940
|
90
|
927
941
|
91
|
@@ -955,10 +969,21 @@ ignored.</p>
|
|
955
969
|
119
|
956
970
|
120
|
957
971
|
121
|
958
|
-
122
|
972
|
+
122
|
973
|
+
123
|
974
|
+
124
|
975
|
+
125
|
976
|
+
126
|
977
|
+
127
|
978
|
+
128
|
979
|
+
129
|
980
|
+
130
|
981
|
+
131
|
982
|
+
132
|
983
|
+
133</pre>
|
959
984
|
</td>
|
960
985
|
<td>
|
961
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
986
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 89</span>
|
962
987
|
|
963
988
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='id identifier rubyid_opts'>opts</span><span class='comma'>,</span> <span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
964
989
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
@@ -966,7 +991,8 @@ ignored.</p>
|
|
966
991
|
<span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
967
992
|
<span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
|
968
993
|
<span class='label'>header_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
969
|
-
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
994
|
+
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
995
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
970
996
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
971
997
|
|
972
998
|
<span class='comment'># Setup config
|
@@ -975,10 +1001,13 @@ ignored.</p>
|
|
975
1001
|
<span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
|
976
1002
|
<span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_map</span><span class='rbracket'>]</span>
|
977
1003
|
<span class='id identifier rubyid_column_parsers'>column_parsers</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span>
|
1004
|
+
<span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
|
978
1005
|
|
979
1006
|
<span class='comment'># Get and parse rows
|
980
1007
|
</span> <span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_css'>css</span><span class='lparen'>(</span><span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:selector</span><span class='rbracket'>]</span><span class='rparen'>)</span>
|
981
1008
|
<span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
|
1009
|
+
<span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&&</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span>
|
1010
|
+
|
982
1011
|
<span class='comment'># First row header validation
|
983
1012
|
</span> <span class='kw'>if</span> <span class='id identifier rubyid_first'>first</span> <span class='op'>&&</span> <span class='id identifier rubyid_first_row_header'>first_row_header</span>
|
984
1013
|
<span class='id identifier rubyid_first'>first</span> <span class='op'>=</span> <span class='kw'>false</span>
|
@@ -989,7 +1018,9 @@ ignored.</p>
|
|
989
1018
|
</span> <span class='id identifier rubyid_row_data'>row_data</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
990
1019
|
<span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_key'>key</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
|
991
1020
|
<span class='comment'># Parse column html with default or custom parser
|
992
|
-
</span> <span class='id identifier
|
1021
|
+
</span> <span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
|
1022
|
+
<span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
|
1023
|
+
<span class='id identifier rubyid_child_element'>child_element</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='lbracket'>[</span><span class='id identifier rubyid_index'>index</span><span class='rbracket'>]</span>
|
993
1024
|
<span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span>
|
994
1025
|
<span class='id identifier rubyid_default_parser'>default_parser</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span> <span class='op'>:</span>
|
995
1026
|
<span class='id identifier rubyid_column_parsers'>column_parsers</span><span class='lbracket'>[</span><span class='id identifier rubyid_key'>key</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_call'>call</span><span class='lparen'>(</span><span class='id identifier rubyid_child_element'>child_element</span><span class='comma'>,</span> <span class='id identifier rubyid_row_data'>row_data</span><span class='comma'>,</span> <span class='id identifier rubyid_key'>key</span><span class='rparen'>)</span>
|
@@ -1106,6 +1137,22 @@ ignored.</p>
|
|
1106
1137
|
— <div class='inline'>
|
1107
1138
|
<p>If true then selector first matching row will be used as header for
|
1108
1139
|
parsing.</p>
|
1140
|
+
</div>
|
1141
|
+
|
1142
|
+
</li>
|
1143
|
+
|
1144
|
+
<li>
|
1145
|
+
<span class="name">:ignore_text_nodes</span>
|
1146
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
1147
|
+
<span class="default">
|
1148
|
+
|
1149
|
+
— default:
|
1150
|
+
<tt>true</tt>
|
1151
|
+
|
1152
|
+
</span>
|
1153
|
+
|
1154
|
+
— <div class='inline'>
|
1155
|
+
<p>Ignore text nodes when retrieving header cells and rows.</p>
|
1109
1156
|
</div>
|
1110
1157
|
|
1111
1158
|
</li>
|
@@ -1138,20 +1185,6 @@ parsing.</p>
|
|
1138
1185
|
<pre class="lines">
|
1139
1186
|
|
1140
1187
|
|
1141
|
-
152
|
1142
|
-
153
|
1143
|
-
154
|
1144
|
-
155
|
1145
|
-
156
|
1146
|
-
157
|
1147
|
-
158
|
1148
|
-
159
|
1149
|
-
160
|
1150
|
-
161
|
1151
|
-
162
|
1152
|
-
163
|
1153
|
-
164
|
1154
|
-
165
|
1155
1188
|
166
|
1156
1189
|
167
|
1157
1190
|
168
|
@@ -1166,21 +1199,43 @@ parsing.</p>
|
|
1166
1199
|
177
|
1167
1200
|
178
|
1168
1201
|
179
|
1169
|
-
180
|
1202
|
+
180
|
1203
|
+
181
|
1204
|
+
182
|
1205
|
+
183
|
1206
|
+
184
|
1207
|
+
185
|
1208
|
+
186
|
1209
|
+
187
|
1210
|
+
188
|
1211
|
+
189
|
1212
|
+
190
|
1213
|
+
191
|
1214
|
+
192
|
1215
|
+
193
|
1216
|
+
194
|
1217
|
+
195
|
1218
|
+
196
|
1219
|
+
197
|
1220
|
+
198
|
1221
|
+
199
|
1222
|
+
200</pre>
|
1170
1223
|
</td>
|
1171
1224
|
<td>
|
1172
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
1225
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 166</span>
|
1173
1226
|
|
1174
1227
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1175
1228
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
1176
1229
|
<span class='label'>html:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1177
1230
|
<span class='label'>selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1178
1231
|
<span class='label'>column_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1179
|
-
<span class='label'>first_row_header:</span> <span class='kw'>false</span>
|
1232
|
+
<span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
|
1233
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
1180
1234
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
1181
1235
|
|
1182
1236
|
<span class='comment'># Setup config
|
1183
1237
|
</span> <span class='id identifier rubyid_dictionary'>dictionary</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_key_label_map</span><span class='rbracket'>]</span>
|
1238
|
+
<span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>=</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
|
1184
1239
|
<span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
|
1185
1240
|
<span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
1186
1241
|
|
@@ -1189,8 +1244,12 @@ parsing.</p>
|
|
1189
1244
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1190
1245
|
<span class='id identifier rubyid_html_rows'>html_rows</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_first'>first</span><span class='rbracket'>]</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
|
1191
1246
|
<span class='id identifier rubyid_html_rows'>html_rows</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_row'>row</span><span class='op'>|</span>
|
1247
|
+
<span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span> <span class='op'>&&</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>==</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span>
|
1248
|
+
|
1192
1249
|
<span class='id identifier rubyid_column_map'>column_map</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1193
|
-
<span class='id identifier
|
1250
|
+
<span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_row'>row</span><span class='period'>.</span><span class='id identifier rubyid_children'>children</span>
|
1251
|
+
<span class='id identifier rubyid_children'>children</span> <span class='op'>=</span> <span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_select'>select</span><span class='lbrace'>{</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='op'>|</span><span class='id identifier rubyid_i'>i</span><span class='period'>.</span><span class='id identifier rubyid_name'>name</span> <span class='op'>!=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>text</span><span class='tstring_end'>'</span></span><span class='rbrace'>}</span> <span class='kw'>if</span> <span class='id identifier rubyid_ignore_text_nodes'>ignore_text_nodes</span>
|
1252
|
+
<span class='id identifier rubyid_children'>children</span><span class='period'>.</span><span class='id identifier rubyid_each_with_index'>each_with_index</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_index'>index</span><span class='op'>|</span>
|
1194
1253
|
<span class='comment'># Parse and map column header
|
1195
1254
|
</span> <span class='id identifier rubyid_column_key'>column_key</span> <span class='op'>=</span> <span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_col'>col</span><span class='comma'>,</span> <span class='id identifier rubyid_dictionary'>dictionary</span>
|
1196
1255
|
<span class='kw'>next</span> <span class='kw'>if</span> <span class='id identifier rubyid_column_key'>column_key</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
@@ -1336,6 +1395,22 @@ parsing.</p>
|
|
1336
1395
|
|
1337
1396
|
— <div class='inline'>
|
1338
1397
|
<p>Custom column parsers for advanced data extraction.</p>
|
1398
|
+
</div>
|
1399
|
+
|
1400
|
+
</li>
|
1401
|
+
|
1402
|
+
<li>
|
1403
|
+
<span class="name">:ignore_text_nodes</span>
|
1404
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
1405
|
+
<span class="default">
|
1406
|
+
|
1407
|
+
— default:
|
1408
|
+
<tt>true</tt>
|
1409
|
+
|
1410
|
+
</span>
|
1411
|
+
|
1412
|
+
— <div class='inline'>
|
1413
|
+
<p>Ignore text nodes when retrieving cells and rows.</p>
|
1339
1414
|
</div>
|
1340
1415
|
|
1341
1416
|
</li>
|
@@ -1443,32 +1518,35 @@ parsing.</p>
|
|
1443
1518
|
<pre class="lines">
|
1444
1519
|
|
1445
1520
|
|
1446
|
-
|
1447
|
-
|
1448
|
-
|
1449
|
-
|
1450
|
-
|
1451
|
-
|
1452
|
-
|
1453
|
-
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
|
1461
|
-
|
1462
|
-
|
1463
|
-
|
1464
|
-
|
1465
|
-
|
1466
|
-
|
1467
|
-
|
1468
|
-
|
1521
|
+
226
|
1522
|
+
227
|
1523
|
+
228
|
1524
|
+
229
|
1525
|
+
230
|
1526
|
+
231
|
1527
|
+
232
|
1528
|
+
233
|
1529
|
+
234
|
1530
|
+
235
|
1531
|
+
236
|
1532
|
+
237
|
1533
|
+
238
|
1534
|
+
239
|
1535
|
+
240
|
1536
|
+
241
|
1537
|
+
242
|
1538
|
+
243
|
1539
|
+
244
|
1540
|
+
245
|
1541
|
+
246
|
1542
|
+
247
|
1543
|
+
248
|
1544
|
+
249
|
1545
|
+
250
|
1546
|
+
251</pre>
|
1469
1547
|
</td>
|
1470
1548
|
<td>
|
1471
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
1549
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 226</span>
|
1472
1550
|
|
1473
1551
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_table'>parse_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
1474
1552
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
@@ -1477,19 +1555,22 @@ parsing.</p>
|
|
1477
1555
|
<span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1478
1556
|
<span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1479
1557
|
<span class='label'>first_row_header:</span> <span class='kw'>false</span><span class='comma'>,</span>
|
1480
|
-
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1558
|
+
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1559
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
1481
1560
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
1482
1561
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1483
1562
|
<span class='id identifier rubyid_header_map'>header_map</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_header_map'>parse_header_map</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1484
1563
|
<span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1485
1564
|
<span class='label'>column_key_label_map:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:header_key_label_map</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1486
|
-
<span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span>
|
1565
|
+
<span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1566
|
+
<span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span>
|
1487
1567
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1488
1568
|
<span class='id identifier rubyid_data'>data</span> <span class='op'>=</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_content'>parse_content</span> <span class='label'>html:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1489
1569
|
<span class='label'>selector:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:content_selector</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1490
1570
|
<span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span>
|
1491
1571
|
<span class='label'>first_row_header:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:first_row_header</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1492
1572
|
<span class='label'>column_parsers:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:column_parsers</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1573
|
+
<span class='label'>ignore_text_nodes:</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:ignore_text_nodes</span><span class='rbracket'>]</span><span class='comma'>,</span>
|
1493
1574
|
<span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
1494
1575
|
<span class='lbrace'>{</span><span class='label'>header_map:</span> <span class='id identifier rubyid_header_map'>header_map</span><span class='comma'>,</span> <span class='label'>data:</span> <span class='id identifier rubyid_data'>data</span><span class='rbrace'>}</span>
|
1495
1576
|
<span class='kw'>end</span></pre>
|
@@ -1625,6 +1706,22 @@ parsing.</p>
|
|
1625
1706
|
|
1626
1707
|
— <div class='inline'>
|
1627
1708
|
<p>Custom column parsers for advanced data extraction.</p>
|
1709
|
+
</div>
|
1710
|
+
|
1711
|
+
</li>
|
1712
|
+
|
1713
|
+
<li>
|
1714
|
+
<span class="name">:ignore_text_nodes</span>
|
1715
|
+
<span class="type">(<tt>Boolean</tt>)</span>
|
1716
|
+
<span class="default">
|
1717
|
+
|
1718
|
+
— default:
|
1719
|
+
<tt>true</tt>
|
1720
|
+
|
1721
|
+
</span>
|
1722
|
+
|
1723
|
+
— <div class='inline'>
|
1724
|
+
<p>Ignore text nodes when retrieving cells and rows.</p>
|
1628
1725
|
</div>
|
1629
1726
|
|
1630
1727
|
</li>
|
@@ -1732,42 +1829,43 @@ parsing.</p>
|
|
1732
1829
|
<pre class="lines">
|
1733
1830
|
|
1734
1831
|
|
1735
|
-
249
|
1736
|
-
250
|
1737
|
-
251
|
1738
|
-
252
|
1739
|
-
253
|
1740
|
-
254
|
1741
|
-
255
|
1742
|
-
256
|
1743
|
-
257
|
1744
|
-
258
|
1745
|
-
259
|
1746
|
-
260
|
1747
|
-
261
|
1748
|
-
262
|
1749
|
-
263
|
1750
|
-
264
|
1751
|
-
265
|
1752
|
-
266
|
1753
|
-
267
|
1754
|
-
268
|
1755
|
-
269
|
1756
|
-
270
|
1757
|
-
271
|
1758
|
-
272
|
1759
|
-
273
|
1760
|
-
274
|
1761
|
-
275
|
1762
1832
|
276
|
1763
1833
|
277
|
1764
1834
|
278
|
1765
1835
|
279
|
1766
1836
|
280
|
1767
|
-
281
|
1837
|
+
281
|
1838
|
+
282
|
1839
|
+
283
|
1840
|
+
284
|
1841
|
+
285
|
1842
|
+
286
|
1843
|
+
287
|
1844
|
+
288
|
1845
|
+
289
|
1846
|
+
290
|
1847
|
+
291
|
1848
|
+
292
|
1849
|
+
293
|
1850
|
+
294
|
1851
|
+
295
|
1852
|
+
296
|
1853
|
+
297
|
1854
|
+
298
|
1855
|
+
299
|
1856
|
+
300
|
1857
|
+
301
|
1858
|
+
302
|
1859
|
+
303
|
1860
|
+
304
|
1861
|
+
305
|
1862
|
+
306
|
1863
|
+
307
|
1864
|
+
308
|
1865
|
+
309</pre>
|
1768
1866
|
</td>
|
1769
1867
|
<td>
|
1770
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
1868
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 276</span>
|
1771
1869
|
|
1772
1870
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_parse_vertical_table'>parse_vertical_table</span> <span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span> <span class='op'>&</span><span class='id identifier rubyid_filter'>filter</span>
|
1773
1871
|
<span class='id identifier rubyid_opts'>opts</span> <span class='op'>=</span> <span class='lbrace'>{</span>
|
@@ -1776,7 +1874,8 @@ parsing.</p>
|
|
1776
1874
|
<span class='label'>header_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1777
1875
|
<span class='label'>header_key_label_map:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1778
1876
|
<span class='label'>content_selector:</span> <span class='kw'>nil</span><span class='comma'>,</span>
|
1779
|
-
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span>
|
1877
|
+
<span class='label'>column_parsers:</span> <span class='lbrace'>{</span><span class='rbrace'>}</span><span class='comma'>,</span>
|
1878
|
+
<span class='label'>ignore_text_nodes:</span> <span class='kw'>true</span>
|
1780
1879
|
<span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_merge'>merge</span> <span class='id identifier rubyid_opts'>opts</span>
|
1781
1880
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_opts'>opts</span><span class='lbracket'>[</span><span class='symbol'>:html</span><span class='rbracket'>]</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1782
1881
|
|
@@ -1810,7 +1909,7 @@ parsing.</p>
|
|
1810
1909
|
<div class="method_details ">
|
1811
1910
|
<h3 class="signature " id="strip-class_method">
|
1812
1911
|
|
1813
|
-
.<strong>strip</strong>(raw_text) ⇒ <tt>String</tt><sup>?</sup>
|
1912
|
+
.<strong>strip</strong>(raw_text, orig_encoding = 'ASCII') ⇒ <tt>String</tt><sup>?</sup>
|
1814
1913
|
|
1815
1914
|
|
1816
1915
|
|
@@ -1819,7 +1918,10 @@ parsing.</p>
|
|
1819
1918
|
</h3><div class="docstring">
|
1820
1919
|
<div class="discussion">
|
1821
1920
|
|
1822
|
-
<p>Strip a value
|
1921
|
+
<p>Strip a value by trimming spaces, reducing sequential spaces into a</p>
|
1922
|
+
|
1923
|
+
<pre class="code ruby"><code class="ruby">single space, decode HTML entities and change encoding to UTF-8.
|
1924
|
+
</code></pre>
|
1823
1925
|
|
1824
1926
|
|
1825
1927
|
</div>
|
@@ -1840,6 +1942,24 @@ parsing.</p>
|
|
1840
1942
|
—
|
1841
1943
|
<div class='inline'>
|
1842
1944
|
<p>Text to strip.</p>
|
1945
|
+
</div>
|
1946
|
+
|
1947
|
+
</li>
|
1948
|
+
|
1949
|
+
<li>
|
1950
|
+
|
1951
|
+
<span class='name'>orig_encoding</span>
|
1952
|
+
|
1953
|
+
|
1954
|
+
<span class='type'>(<tt>String</tt>)</span>
|
1955
|
+
|
1956
|
+
|
1957
|
+
<em class="default">(defaults to: <tt>'ASCII'</tt>)</em>
|
1958
|
+
|
1959
|
+
|
1960
|
+
—
|
1961
|
+
<div class='inline'>
|
1962
|
+
<p>Text original encoding.</p>
|
1843
1963
|
</div>
|
1844
1964
|
|
1845
1965
|
</li>
|
@@ -1871,8 +1991,6 @@ parsing.</p>
|
|
1871
1991
|
<pre class="lines">
|
1872
1992
|
|
1873
1993
|
|
1874
|
-
42
|
1875
|
-
43
|
1876
1994
|
44
|
1877
1995
|
45
|
1878
1996
|
46
|
@@ -1882,21 +2000,23 @@ parsing.</p>
|
|
1882
2000
|
50
|
1883
2001
|
51
|
1884
2002
|
52
|
1885
|
-
53
|
2003
|
+
53
|
2004
|
+
54
|
2005
|
+
55</pre>
|
1886
2006
|
</td>
|
1887
2007
|
<td>
|
1888
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
2008
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 44</span>
|
1889
2009
|
|
1890
|
-
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span>
|
2010
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='comma'>,</span> <span class='id identifier rubyid_orig_encoding'>orig_encoding</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>ASCII</span><span class='tstring_end'>'</span></span>
|
1891
2011
|
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
1892
2012
|
<span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span> <span class='kw'>unless</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span> <span class='const'>String</span>
|
1893
2013
|
<span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0)+</span><span class='regexp_end'>/</span></span>
|
1894
2014
|
<span class='id identifier rubyid_good_encoding'>good_encoding</span> <span class='op'>=</span> <span class='lparen'>(</span><span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=~</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\u3000</span><span class='regexp_end'>/</span></span> <span class='op'>||</span> <span class='kw'>true</span><span class='rparen'>)</span> <span class='kw'>rescue</span> <span class='kw'>false</span>
|
1895
2015
|
<span class='kw'>unless</span> <span class='id identifier rubyid_good_encoding'>good_encoding</span>
|
1896
|
-
<span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='
|
2016
|
+
<span class='id identifier rubyid_raw_text'>raw_text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_force_encoding'>force_encoding</span><span class='lparen'>(</span><span class='id identifier rubyid_orig_encoding'>orig_encoding</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_encode'>encode</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>UTF-8</span><span class='tstring_end'>'</span></span><span class='comma'>,</span> <span class='label'>invalid:</span> <span class='symbol'>:replace</span><span class='comma'>,</span> <span class='label'>undef:</span> <span class='symbol'>:replace</span><span class='rparen'>)</span>
|
1897
2017
|
<span class='id identifier rubyid_regex'>regex</span> <span class='op'>=</span> <span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>(\s|\u3000|\u00a0|\u00c2\u00a0)+</span><span class='regexp_end'>/</span></span>
|
1898
2018
|
<span class='kw'>end</span>
|
1899
|
-
<span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='
|
2019
|
+
<span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_raw_text'>raw_text</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='id identifier rubyid_regex'>regex</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'> </span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span>
|
1900
2020
|
<span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_decode_html'>decode_html</span><span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span><span class='rparen'>)</span>
|
1901
2021
|
<span class='kw'>end</span></pre>
|
1902
2022
|
</td>
|
@@ -1984,25 +2104,27 @@ parsing.</p>
|
|
1984
2104
|
<pre class="lines">
|
1985
2105
|
|
1986
2106
|
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
2107
|
+
142
|
2108
|
+
143
|
2109
|
+
144
|
2110
|
+
145
|
2111
|
+
146
|
2112
|
+
147
|
2113
|
+
148
|
2114
|
+
149
|
2115
|
+
150</pre>
|
1995
2116
|
</td>
|
1996
2117
|
<td>
|
1997
|
-
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line
|
2118
|
+
<pre class="code"><span class="info file"># File 'lib/ae_easy/text.rb', line 142</span>
|
1998
2119
|
|
1999
2120
|
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_translate_label_to_key'>translate_label_to_key</span> <span class='id identifier rubyid_element'>element</span><span class='comma'>,</span> <span class='id identifier rubyid_label_map'>label_map</span>
|
2000
|
-
<span class='
|
2001
|
-
<span class='id identifier
|
2002
|
-
<span class='id identifier
|
2121
|
+
<span class='kw'>return</span> <span class='kw'>nil</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
2122
|
+
<span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_remove'>remove</span> <span class='kw'>if</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_search'>search</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//i</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_count'>count</span> <span class='op'>></span> <span class='int'>0</span>
|
2123
|
+
<span class='id identifier rubyid_text'>text</span> <span class='op'>=</span> <span class='id identifier rubyid_strip'>strip</span> <span class='id identifier rubyid_element'>element</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
2124
|
+
<span class='id identifier rubyid_key_pair'>key_pair</span> <span class='op'>=</span> <span class='id identifier rubyid_label_map'>label_map</span><span class='period'>.</span><span class='id identifier rubyid_find'>find</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_k'>k</span><span class='comma'>,</span><span class='id identifier rubyid_v'>v</span><span class='op'>|</span>
|
2003
2125
|
<span class='id identifier rubyid_v'>v</span><span class='period'>.</span><span class='id identifier rubyid_is_a?'>is_a?</span><span class='lparen'>(</span><span class='const'>Regexp</span><span class='rparen'>)</span> <span class='op'>?</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>=~</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span> <span class='op'>:</span> <span class='lparen'>(</span><span class='id identifier rubyid_text'>text</span> <span class='op'>==</span> <span class='id identifier rubyid_v'>v</span><span class='rparen'>)</span>
|
2004
|
-
<span class='kw'>end</span
|
2005
|
-
<span class='id identifier rubyid_key'>key</span>
|
2126
|
+
<span class='kw'>end</span>
|
2127
|
+
<span class='id identifier rubyid_key'>key</span> <span class='op'>=</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>?</span> <span class='kw'>nil</span> <span class='op'>:</span> <span class='id identifier rubyid_key_pair'>key_pair</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
|
2006
2128
|
<span class='kw'>end</span></pre>
|
2007
2129
|
</td>
|
2008
2130
|
</tr>
|
@@ -2014,7 +2136,7 @@ parsing.</p>
|
|
2014
2136
|
</div>
|
2015
2137
|
|
2016
2138
|
<div id="footer">
|
2017
|
-
Generated on
|
2139
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
2018
2140
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
2019
2141
|
0.9.18 (ruby-2.5.3).
|
2020
2142
|
</div>
|
data/doc/_index.html
CHANGED
@@ -112,7 +112,7 @@
|
|
112
112
|
</div>
|
113
113
|
|
114
114
|
<div id="footer">
|
115
|
-
Generated on
|
115
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
116
116
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
117
117
|
0.9.18 (ruby-2.5.3).
|
118
118
|
</div>
|
data/doc/file.README.html
CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
|
|
81
81
|
</div></div>
|
82
82
|
|
83
83
|
<div id="footer">
|
84
|
-
Generated on
|
84
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
85
85
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
86
86
|
0.9.18 (ruby-2.5.3).
|
87
87
|
</div>
|
data/doc/index.html
CHANGED
@@ -81,7 +81,7 @@ href="http://rubydoc.org/gems/ae_easy-text/frames">here</a>.</p>
|
|
81
81
|
</div></div>
|
82
82
|
|
83
83
|
<div id="footer">
|
84
|
-
Generated on
|
84
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
85
85
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
86
86
|
0.9.18 (ruby-2.5.3).
|
87
87
|
</div>
|
@@ -100,7 +100,7 @@
|
|
100
100
|
</div>
|
101
101
|
|
102
102
|
<div id="footer">
|
103
|
-
Generated on
|
103
|
+
Generated on Fri Mar 8 17:26:54 2019 by
|
104
104
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
105
105
|
0.9.18 (ruby-2.5.3).
|
106
106
|
</div>
|
data/lib/ae_easy/text.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'cgi'
|
2
2
|
require 'json'
|
3
3
|
require 'digest/sha1'
|
4
|
-
require 'ae_easy
|
4
|
+
require 'ae_easy/core'
|
5
5
|
require 'ae_easy/text/version'
|
6
6
|
|
7
7
|
module AeEasy
|
@@ -34,21 +34,23 @@ module AeEasy
|
|
34
34
|
CGI.unescapeHTML text
|
35
35
|
end
|
36
36
|
|
37
|
-
# Strip a value
|
37
|
+
# Strip a value by trimming spaces, reducing sequential spaces into a
|
38
|
+
# single space, decoding HTML entities and changing encoding to UTF-8.
|
38
39
|
#
|
39
40
|
# @param [String,Object,nil] raw_text Text to strip.
|
41
|
+
# @param [String] orig_encoding Text original encoding.
|
40
42
|
#
|
41
43
|
# @return [String,nil] `nil` when +raw_text+ is nil, else `String`.
|
42
|
-
def self.strip raw_text
|
44
|
+
def self.strip raw_text, orig_encoding = 'ASCII'
|
43
45
|
return nil if raw_text.nil?
|
44
46
|
raw_text = raw_text.to_s unless raw_text.is_a? String
|
45
47
|
regex = /(\s|\u3000|\u00a0)+/
|
46
48
|
good_encoding = (raw_text =~ /\u3000/ || true) rescue false
|
47
49
|
unless good_encoding
|
48
|
-
raw_text = raw_text.force_encoding(
|
50
|
+
raw_text = raw_text.force_encoding(orig_encoding).encode('UTF-8', invalid: :replace, undef: :replace)
|
49
51
|
regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
|
50
52
|
end
|
51
|
-
text = raw_text
|
53
|
+
text = raw_text.gsub(regex, ' ').strip
|
52
54
|
text.nil? ? nil : decode_html(text)
|
53
55
|
end
|
54
56
|
|
@@ -58,8 +60,9 @@ module AeEasy
|
|
58
60
|
# @param [Hash] data Data hash to save parsed data into.
|
59
61
|
# @param [String,Symbol] key Header column key being parsed.
|
60
62
|
def self.default_parser cell_element, data, key
|
61
|
-
cell_element
|
62
|
-
|
63
|
+
return if cell_element.nil?
|
64
|
+
cell_element.search('//i').remove if cell_element.search('//i').count > 0
|
65
|
+
data[key] = strip cell_element.text
|
63
66
|
end
|
64
67
|
|
65
68
|
# Parse row data matching a selector using a header map to translate
|
@@ -74,6 +77,8 @@ module AeEasy
|
|
74
77
|
# index dictionary.
|
75
78
|
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
|
76
79
|
# Custom column parsers for advanced data extraction.
|
80
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
81
|
+
# retrieving content cells and rows.
|
77
82
|
#
|
78
83
|
# @yieldparam [Hash{Symbol,String => Object}] data Parsed row data.
|
79
84
|
# @yieldparam [Array] row Raw row data.
|
@@ -87,7 +92,8 @@ module AeEasy
|
|
87
92
|
selector: nil,
|
88
93
|
first_row_header: false,
|
89
94
|
header_map: {},
|
90
|
-
column_parsers: {}
|
95
|
+
column_parsers: {},
|
96
|
+
ignore_text_nodes: true
|
91
97
|
}.merge opts
|
92
98
|
|
93
99
|
# Setup config
|
@@ -96,10 +102,13 @@ module AeEasy
|
|
96
102
|
first = first_row_header = opts[:first_row_header]
|
97
103
|
header_map = opts[:header_map]
|
98
104
|
column_parsers = opts[:column_parsers]
|
105
|
+
ignore_text_nodes = opts[:ignore_text_nodes]
|
99
106
|
|
100
107
|
# Get and parse rows
|
101
108
|
html_rows = opts[:html].css(opts[:selector])
|
102
109
|
html_rows.each do |row|
|
110
|
+
next if ignore_text_nodes && row.name == 'text'
|
111
|
+
|
103
112
|
# First row header validation
|
104
113
|
if first && first_row_header
|
105
114
|
first = false
|
@@ -110,7 +119,9 @@ module AeEasy
|
|
110
119
|
row_data = {}
|
111
120
|
header_map.each do |key, index|
|
112
121
|
# Parse column html with default or custom parser
|
113
|
-
|
122
|
+
children = row.children
|
123
|
+
children = children.select{|i|i.name != 'text'} if ignore_text_nodes
|
124
|
+
child_element = children[index]
|
114
125
|
column_parsers[key].nil? ?
|
115
126
|
default_parser(child_element, row_data, key) :
|
116
127
|
column_parsers[key].call(child_element, row_data, key)
|
@@ -129,12 +140,13 @@ module AeEasy
|
|
129
140
|
#
|
130
141
|
# @return [Symbol,String] Translated key.
|
131
142
|
def self.translate_label_to_key element, label_map
|
132
|
-
element
|
133
|
-
|
134
|
-
|
143
|
+
return nil if element.nil?
|
144
|
+
element.search('//i').remove if element.search('//i').count > 0
|
145
|
+
text = strip element.text
|
146
|
+
key_pair = label_map.find do |k,v|
|
135
147
|
v.is_a?(Regexp) ? (text =~ v) : (text == v)
|
136
|
-
end
|
137
|
-
key
|
148
|
+
end
|
149
|
+
key = key_pair.nil? ? nil : key_pair[0]
|
138
150
|
end
|
139
151
|
|
140
152
|
# Parse header from selector and create a header map to match a column key
|
@@ -147,6 +159,8 @@ module AeEasy
|
|
147
159
|
# Key vs. label dictionary.
|
148
160
|
# @option opts [Boolean] :first_row_header (false) If true then selector
|
149
161
|
# first matching row will be used as header for parsing.
|
162
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
163
|
+
# retrieving header cells and rows.
|
150
164
|
#
|
151
165
|
# @return [Hash{Symbol,String => Integer},nil] Key vs. column index map.
|
152
166
|
def self.parse_header_map opts = {}
|
@@ -154,11 +168,13 @@ module AeEasy
|
|
154
168
|
html: nil,
|
155
169
|
selector: nil,
|
156
170
|
column_key_label_map: {},
|
157
|
-
first_row_header: false
|
171
|
+
first_row_header: false,
|
172
|
+
ignore_text_nodes: true
|
158
173
|
}.merge opts
|
159
174
|
|
160
175
|
# Setup config
|
161
176
|
dictionary = opts[:column_key_label_map]
|
177
|
+
ignore_text_nodes = opts[:ignore_text_nodes]
|
162
178
|
data = []
|
163
179
|
column_map = nil
|
164
180
|
|
@@ -167,8 +183,12 @@ module AeEasy
|
|
167
183
|
return nil if html_rows.nil?
|
168
184
|
html_rows = [html_rows.first] if opts[:first_row_header]
|
169
185
|
html_rows.each do |row|
|
186
|
+
next if ignore_text_nodes && row.name == 'text'
|
187
|
+
|
170
188
|
column_map = {}
|
171
|
-
row.children
|
189
|
+
children = row.children
|
190
|
+
children = children.select{|i|i.name != 'text'} if ignore_text_nodes
|
191
|
+
children.each_with_index do |col, index|
|
172
192
|
# Parse and map column header
|
173
193
|
column_key = translate_label_to_key col, dictionary
|
174
194
|
next if column_key.nil?
|
@@ -192,6 +212,8 @@ module AeEasy
|
|
192
212
|
# first matching row will be used as header for parsing.
|
193
213
|
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
|
194
214
|
# Custom column parsers for advanced data extraction.
|
215
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
216
|
+
# retrieving cells and rows.
|
195
217
|
#
|
196
218
|
# @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
|
197
219
|
# @yieldparam [Array] row Raw content row data.
|
@@ -208,19 +230,22 @@ module AeEasy
|
|
208
230
|
header_key_label_map: {},
|
209
231
|
content_selector: nil,
|
210
232
|
first_row_header: false,
|
211
|
-
column_parsers: {}
|
233
|
+
column_parsers: {},
|
234
|
+
ignore_text_nodes: true
|
212
235
|
}.merge opts
|
213
236
|
return nil if opts[:html].nil?
|
214
237
|
header_map = self.parse_header_map html: opts[:html],
|
215
238
|
selector: opts[:header_selector],
|
216
239
|
column_key_label_map: opts[:header_key_label_map],
|
217
|
-
first_row_header: opts[:first_row_header]
|
240
|
+
first_row_header: opts[:first_row_header],
|
241
|
+
ignore_text_nodes: opts[:ignore_text_nodes]
|
218
242
|
return nil if header_map.nil?
|
219
243
|
data = self.parse_content html: opts[:html],
|
220
244
|
selector: opts[:content_selector],
|
221
245
|
header_map: header_map,
|
222
246
|
first_row_header: opts[:first_row_header],
|
223
247
|
column_parsers: opts[:column_parsers],
|
248
|
+
ignore_text_nodes: opts[:ignore_text_nodes],
|
224
249
|
&filter
|
225
250
|
{header_map: header_map, data: data}
|
226
251
|
end
|
@@ -237,6 +262,8 @@ module AeEasy
|
|
237
262
|
# @option opts [String] :content_selector Content row elements selector.
|
238
263
|
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
|
239
264
|
# Custom column parsers for advanced data extraction.
|
265
|
+
# @option opts [Boolean] :ignore_text_nodes (true) Ignore text nodes when
|
266
|
+
# retrieving cells and rows.
|
240
267
|
#
|
241
268
|
# @yieldparam [Hash{Symbol,String => Object}] data Parsed content row data.
|
242
269
|
# @yieldparam [Array] row Raw content row data.
|
@@ -253,7 +280,8 @@ module AeEasy
|
|
253
280
|
header_selector: nil,
|
254
281
|
header_key_label_map: {},
|
255
282
|
content_selector: nil,
|
256
|
-
column_parsers: {}
|
283
|
+
column_parsers: {},
|
284
|
+
ignore_text_nodes: true
|
257
285
|
}.merge opts
|
258
286
|
return nil if opts[:html].nil?
|
259
287
|
|
data/lib/ae_easy/text/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ae_easy-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo Rosales
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ae_easy-core
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.1.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.1.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|