scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,563 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Pure-Ruby DOM. Built from the SAX tokenizer. The backing tree for
5
+ # Scrapetor::Document when the native streaming extract path isn't
6
+ # applicable (i.e. for `doc.css(...)`, `doc.at(...)`, mutation, and
7
+ # serialization).
8
+ #
9
+ # This is intentionally minimal — node types are Element / Text /
10
+ # Comment / Doctype, plus a Document root. The CSS selector engine
11
+ # lives in `dom/selectors.rb`.
12
+ module Dom
13
+ VOID = %w[
14
+ area base br col embed hr img input link meta source track wbr
15
+ ].freeze
16
+
17
# Behaviour shared by every node class in this file (Element, Text,
# Comment, Doctype and Document): parent tracking, detaching, sibling
# insertion and sibling navigation.
#
# A node that acts as a parent is expected to expose its ordered child
# list through `children`.
module NodeMethods
  attr_accessor :parent

  # Follows the parent chain upwards and returns the topmost node. For a
  # detached subtree that is the subtree's own root rather than a
  # Document.
  def document
    cur = self
    cur = cur.parent while cur.respond_to?(:parent) && cur.parent
    cur
  end

  # Type predicates; each concrete class overrides the one that applies.
  def element?; false; end
  def text?; false; end
  def comment?; false; end
  def doctype?; false; end

  # Detaches this node from its parent.
  #
  # @return [self] always, including when the node was already detached
  #   (previously that case returned nil).
  def remove
    return self unless @parent
    @parent.children.delete(self)
    @parent = nil
    self
  end
  alias unlink remove
  alias delete remove

  # Replaces this node with the given node(s) or parsed HTML string.
  #
  # Replacement nodes that currently live elsewhere are moved, not
  # copied. Replacing a node with itself is a no-op.
  #
  # @return the last inserted node (nil when nothing was inserted), or
  #   self when this node has no parent / is not listed by its parent.
  def replace(node_or_html)
    return self unless @parent
    return self if node_or_html.equal?(self)
    return self unless @parent.children.index(self)

    replacements = detach_incoming_nodes(node_or_html)
    # Look the position up only now: detaching may have removed earlier
    # siblings and shifted it.
    idx = @parent.children.index(self)
    new_parent = @parent
    new_parent.children[idx, 1] = replacements
    @parent = nil
    # Assigned after clearing our own parent so that a replacement list
    # which contains self keeps a valid parent pointer.
    replacements.each { |r| r.parent = new_parent }
    replacements.last
  end
  alias swap replace
  alias replace_with replace

  # Inserts node(s) / parsed HTML immediately before this node.
  #
  # @return the last inserted node, or self when nothing could be done.
  def add_previous_sibling(node_or_html)
    insert_siblings(node_or_html, 0)
  end
  alias before add_previous_sibling

  # Inserts node(s) / parsed HTML immediately after this node.
  #
  # @return the last inserted node, or self when nothing could be done.
  def add_next_sibling(node_or_html)
    insert_siblings(node_or_html, 1)
  end
  alias after add_next_sibling

  # @return the node following this one under the same parent, or nil.
  def next_sibling
    return nil unless @parent
    sibs = @parent.children
    idx = sibs.index(self)
    idx && sibs[idx + 1]
  end

  # @return the node preceding this one under the same parent, or nil.
  def previous_sibling
    return nil unless @parent
    sibs = @parent.children
    idx = sibs.index(self)
    idx && idx > 0 ? sibs[idx - 1] : nil
  end

  # @return the closest following sibling that is an element, or nil.
  def next_element_sibling
    return nil unless @parent
    sibs = @parent.children
    idx = sibs.index(self)
    return nil unless idx
    # One index lookup and a forward scan instead of a lookup per step.
    sibs.drop(idx + 1).find(&:element?)
  end

  # @return the closest preceding sibling that is an element, or nil.
  def previous_element_sibling
    return nil unless @parent
    sibs = @parent.children
    idx = sibs.index(self)
    return nil unless idx
    sibs.first(idx).reverse_each.find(&:element?)
  end

  private

  # Shared body of add_previous_sibling (offset 0) / add_next_sibling
  # (offset 1).
  def insert_siblings(node_or_html, offset)
    return self unless @parent
    return self if node_or_html.equal?(self)
    siblings = @parent.children
    return self unless siblings.index(self)

    nodes = detach_incoming_nodes(node_or_html)
    nodes.reject! { |n| n.equal?(self) } # a node cannot be its own sibling
    idx = siblings.index(self)           # after detaching; see #replace
    nodes.each { |n| n.parent = @parent }
    siblings.insert(idx + offset, *nodes)
    nodes.last
  end

  # Normalises the argument into a private array of nodes and unhooks each
  # of them from the parent that currently lists it, so inserting them
  # here moves them instead of leaving them in two child lists.
  def detach_incoming_nodes(node_or_html)
    nodes = Dom.normalize_replacement(node_or_html, parent: @parent).dup
    nodes.each do |n|
      next if n.equal?(self)
      previous = n.parent
      next unless previous.respond_to?(:children)
      previous.children.delete_if { |c| c.equal?(n) }
    end
    nodes
  end
end
101
+
102
# An element node: a lower-cased tag name, an attribute hash that the
# accessors below read and write with String keys, and an ordered list of
# child nodes.
class Element
  include NodeMethods

  attr_accessor :name, :attributes, :children, :line

  # @param name [#to_s] tag name; stored lower-cased
  # @param attributes [Hash] attribute hash, stored as given (not copied)
  # @param line [Object, nil] stored as given and exposed through `line`
  def initialize(name, attributes = {}, line: nil)
    @name = name.to_s.downcase
    @attributes = attributes
    @children = []
    @parent = nil
    @line = line
  end

  def element?; true; end

  # ----- attribute access -----

  # @return [String, nil] the attribute value, or nil when absent
  def [](key)
    @attributes[key.to_s]
  end

  # Sets an attribute (value is stringified); assigning nil deletes it.
  def []=(key, value)
    if value.nil?
      @attributes.delete(key.to_s)
    else
      @attributes[key.to_s] = value.to_s
    end
    value
  end

  def attribute_value(key)
    self[key]
  end

  # @return [self]
  def remove_attribute(key)
    @attributes.delete(key.to_s)
    self
  end

  def has_attribute?(key)
    @attributes.key?(key.to_s)
  end

  def keys
    @attributes.keys
  end

  def values
    @attributes.values
  end

  # ----- class manipulation -----

  # @return [Array<String>] whitespace-separated tokens of the `class`
  #   attribute (empty array when there is none)
  def classes
    (self["class"] || "").split(/\s+/).reject(&:empty?)
  end

  # Adds one or more whitespace-separated class names, skipping names that
  # are already present.
  #
  # @return [self]
  def add_class(klass)
    set = classes
    klass.to_s.split(/\s+/).each { |c| set << c unless set.include?(c) || c.empty? }
    self["class"] = set.join(" ")
    self
  end
  alias append_class add_class

  # Removes the given class name(s); with no argument removes the whole
  # `class` attribute. The attribute is also dropped once it becomes empty.
  #
  # @return [self]
  def remove_class(klass = nil)
    if klass.nil?
      remove_attribute("class")
    else
      set = classes
      klass.to_s.split(/\s+/).each { |c| set.delete(c) }
      if set.empty?
        remove_attribute("class")
      else
        self["class"] = set.join(" ")
      end
    end
    self
  end

  def has_class?(klass)
    classes.include?(klass.to_s)
  end

  # ----- text / inner_html -----

  # @return [String] concatenation of every child's `text`
  def text
    @children.map(&:text).join
  end
  alias content text
  alias inner_text text

  # Replaces all children with a single text node.
  def text=(s)
    orphan_children
    @children = [Text.new(s.to_s, parent: self)]
    s
  end
  alias content= text=

  def inner_html
    @children.map(&:to_html).join
  end

  # Replaces all children with the nodes parsed from +html+.
  def inner_html=(html)
    nodes = Dom::Parser.fragment(html.to_s)
    orphan_children
    nodes.each { |n| n.parent = self }
    @children = nodes
    html
  end

  # Serialises the element. A void element without children is emitted
  # without a closing tag.
  def outer_html
    attrs = serialize_attrs
    if VOID.include?(@name) && @children.empty?
      "<#{@name}#{attrs}>"
    else
      "<#{@name}#{attrs}>#{inner_html}</#{@name}>"
    end
  end
  alias to_html outer_html
  alias to_xml outer_html
  alias to_s outer_html

  # ----- children / traversal -----

  # Appends node(s) or parsed HTML as the last children of this element.
  # A node that currently hangs under a different parent is moved here,
  # not left listed in both places.
  #
  # A node whose parent already is this element is appended without first
  # scanning the child list for it, to keep repeated appends cheap.
  #
  # @return the last appended node (nil when nothing was appended)
  def add_child(node_or_html)
    nodes = Dom.normalize_replacement(node_or_html, parent: self).dup
    nodes.each do |n|
      previous = n.parent
      if previous.respond_to?(:children) && !previous.equal?(self)
        previous.children.delete_if { |c| c.equal?(n) }
      end
      n.parent = self
      @children << n
    end
    nodes.last
  end
  alias << add_child

  def element_children
    @children.select(&:element?)
  end
  alias elements element_children

  def first_element_child
    @children.find(&:element?)
  end

  def last_element_child
    @children.reverse_each.find(&:element?)
  end

  # ----- selectors -----

  def css(selector)
    Dom::Selectors.css(self, selector)
  end

  def at_css(selector)
    css(selector).first
  end
  alias at at_css
  alias search css

  def xpath(_expr)
    # Minimal XPath support is out of scope for the pure-Ruby DOM.
    # Callers that need full XPath can install nokogiri/nokolexbor
    # separately and pass HTML through them.
    []
  end

  def at_xpath(expr)
    xpath(expr).first
  end

  # ----- node type / misc -----

  def node_type; 1; end
  def type; 1; end
  def tag_name; @name; end
  def node_name; @name; end

  # Builds an XPath-like locator for this element by walking up through
  # its Element ancestors. Each step is `name[n]`, n being the 1-based
  # position among preceding same-name siblings. The walk stops early at
  # the first element carrying a non-empty `id`, which becomes
  # `name[@id='…']`.
  def path
    parts = []
    cur = self
    while cur.is_a?(Element)
      if cur["id"] && !cur["id"].empty?
        parts.unshift(cur.name + "[@id='#{cur['id']}']")
        break
      end
      idx = 1
      sib = cur.previous_sibling
      while sib
        idx += 1 if sib.is_a?(Element) && sib.name == cur.name
        sib = sib.previous_sibling
      end
      parts.unshift("#{cur.name}[#{idx}]")
      cur = cur.parent
    end
    "/" + parts.join("/")
  end

  # True when evaluating +selector+ from the top of this node's tree
  # yields this exact element.
  def matches?(selector)
    document.css(selector).any? { |n| n.equal?(self) }
  end

  # Wrap this element in an HTML fragment (string) or another element,
  # placing this element as the deepest descendant of the wrapping
  # tree. Matches Nokogiri's `Node#wrap` semantics.
  #
  # Only an element can act as the wrapper: input that yields no element
  # (for instance plain text) leaves the tree untouched. A wrapper element
  # that is currently attached somewhere is moved.
  #
  # @return [self]
  def wrap(html_or_node)
    return self unless @parent
    wrapper = if html_or_node.is_a?(Element)
                html_or_node
              else
                Dom::Parser.fragment(html_or_node.to_s).find(&:element?)
              end
    return self if wrapper.nil? || wrapper.equal?(self)
    return self unless @parent.children.index(self)

    holder = wrapper.parent
    holder.children.delete_if { |c| c.equal?(wrapper) } if holder.respond_to?(:children)

    # Drill to the deepest first element.
    deepest = wrapper
    while (next_level = deepest.first_element_child)
      deepest = next_level
    end

    # Replace self with the wrapper, then re-parent self under deepest.
    # The index is read after detaching the wrapper, which may have been
    # an earlier sibling.
    idx = @parent.children.index(self)
    wrapper.parent = @parent
    @parent.children[idx, 1] = [wrapper]
    @parent = deepest
    deepest.children << self
    self
  end

  # Yields this element and then every descendant, parents before their
  # children. Without a block returns an Enumerator.
  def traverse(&block)
    return enum_for(:traverse) unless block_given?
    yield self
    @children.each do |c|
      if c.respond_to?(:traverse)
        c.traverse(&block)
      else
        yield c
      end
    end
    self
  end

  # @return [Array<AttrNode>] one wrapper per attribute
  def attribute_nodes
    @attributes.map { |k, v| AttrNode.new(k, v, self) }
  end

  # @return [AttrNode, nil] wrapper for the named attribute, looked up
  #   directly instead of wrapping every attribute first
  def attribute(name)
    key = name.to_s
    AttrNode.new(key, @attributes[key], self) if @attributes.key?(key)
  end

  private

  def serialize_attrs
    @attributes.map { |k, v| %( #{k}="#{Dom.escape_attr(v)}") }.join
  end

  # Clears the parent pointer of children that are about to be discarded,
  # so they no longer claim to belong to this element.
  def orphan_children
    @children.each { |c| c.parent = nil if c.parent.equal?(self) }
  end
end
356
+
357
# A text node holding its character data in `data`.
class Text
  include NodeMethods

  # Parent elements whose text content is serialised verbatim. Entity
  # escaping there would change the script / stylesheet itself
  # (`a < b` would become `a &lt; b`).
  RAW_TEXT_PARENTS = %w[script style].freeze

  attr_accessor :data

  # @param data [#to_s] text content
  # @param parent [Object, nil] owning node, if already known
  def initialize(data, parent: nil)
    @data = data.to_s
    @parent = parent
  end

  def text; @data; end
  def content; @data; end
  alias inner_text text
  def text?; true; end
  def name; "#text"; end
  def to_s; @data; end
  def node_type; 3; end

  # Serialises the text: entity-escaped, except directly inside a
  # script/style element where it is emitted as stored.
  # NOTE(review): assumes the parser keeps script/style text undecoded —
  # confirm against the tokenizer.
  def to_html
    raw_text_context? ? @data : Dom.escape_text(@data)
  end

  private

  def raw_text_context?
    holder = @parent
    holder.respond_to?(:element?) && holder.element? &&
      RAW_TEXT_PARENTS.include?(holder.name)
  end
end
372
+
373
# A comment node. `data` is the comment body; serialisation wraps it in
# `<!--` … `-->`.
class Comment
  include NodeMethods

  attr_accessor :data

  # @param data [#to_s] comment body
  # @param parent [Object, nil] owning node, if already known
  def initialize(data, parent: nil)
    @parent = parent
    @data = data.to_s
  end

  def comment?
    true
  end

  def name
    "#comment"
  end

  def node_type
    8
  end

  # Comments contribute nothing to extracted text.
  def text
    ""
  end

  # Unlike `text`, `content` exposes the comment body.
  def content
    @data
  end

  def to_html
    "<!--#{@data}-->"
  end

  def to_s
    to_html
  end
end
388
+
389
# A doctype node; `name` is the text emitted after `<!DOCTYPE `.
class Doctype
  include NodeMethods

  attr_accessor :name

  # @param name [#to_s] doctype name
  # @param parent [Object, nil] owning node, if already known
  def initialize(name, parent: nil)
    @parent = parent
    @name = name.to_s
  end

  def doctype?
    true
  end

  def node_type
    10
  end

  # A doctype carries no text and no content.
  def text
    ""
  end

  def content
    ""
  end

  def to_html
    "<!DOCTYPE #{@name}>"
  end

  def to_s
    to_html
  end
end
403
+
404
# Read-only view of a single attribute: its name, its value and the
# element that owns it.
class AttrNode
  attr_reader :name, :value, :owner

  def initialize(name, value, owner)
    @name = name
    @value = value
    @owner = owner
  end

  # Renders the attribute as `name="value"`. The value is escaped with the
  # same helper used when elements serialise their attributes, so a value
  # containing `"`, `&`, `<` or `>` still yields well-formed markup.
  def to_s
    %(#{@name}="#{Dom.escape_attr(@value)}")
  end

  # Nokogiri-compat: attribute nodes expose .text / .content /
  # .inner_text that return the attribute's value. Real-world code
  # iterates `node.attribute_nodes` and reads `.text` on each.
  def text; @value.to_s; end
  alias content text
  alias inner_text text
end
419
+
420
# Root of a tree. Holds the top-level nodes, an optional doctype and three
# lazily built lookup tables (by class token, tag name and id).
class Document
  include NodeMethods

  attr_accessor :doctype, :children

  def initialize
    @children = []
    @doctype = nil
    @parent = nil
    @class_index = nil
    @tag_index = nil
    @id_index = nil
  end

  def element?; false; end
  def document?; true; end
  def name; "#document"; end

  # Lazy structural indexes. Built on first access during a fallback
  # selector evaluation so the per-query candidate set drops from
  # "every element in document order" to "elements that already
  # carry the anchor class / tag / id". On a 100KB document with
  # ~5000 elements that's the difference between a 5ms walk and a
  # ~50µs lookup.
  #
  # The tables are snapshots: they are discarded by #add_child on the
  # document and by #invalidate_indexes!, but not by mutations made
  # deeper in the tree.
  def class_index
    @class_index ||= build_indexes![:class]
  end

  def tag_index
    @tag_index ||= build_indexes![:tag]
  end

  def id_index
    @id_index ||= build_indexes![:id]
  end

  # Walks every element once and (re)builds all three tables.
  #
  # @return [Hash] `{ class:, tag:, id: }`. The class and tag tables map a
  #   String to an Array of elements in document order; the id table maps
  #   an id to the first element carrying it.
  def build_indexes!
    cls = Hash.new { |h, k| h[k] = [] }
    tag = Hash.new { |h, k| h[k] = [] }
    ids = {}
    index_subtree(self, cls, tag, ids)
    @class_index = cls
    @tag_index = tag
    @id_index = ids
    { class: cls, tag: tag, id: ids }
  end

  # Drops the memoised tables so the next lookup rebuilds them. Call this
  # after mutating the tree below the document.
  #
  # @return [self]
  def invalidate_indexes!
    @class_index = nil
    @tag_index = nil
    @id_index = nil
    self
  end

  # @return the first top-level element, or nil
  def root
    @children.find(&:element?)
  end

  # @return the top-level `html` element, falling back to #root
  def html_element
    @children.find { |c| c.element? && c.name == "html" } || root
  end

  # @return the first `head` found below a top-level element, or nil
  def head
    first_descendant_match("head")
  end

  # @return the first `body` found below a top-level element, or nil
  def body
    first_descendant_match("body")
  end

  def text
    @children.map(&:text).join
  end

  def css(selector)
    Dom::Selectors.css(self, selector)
  end

  def at_css(selector)
    css(selector).first
  end
  alias at at_css

  # XPath is not implemented by the pure-Ruby DOM; always empty.
  def xpath(_expr); []; end
  def at_xpath(expr); xpath(expr).first; end

  # Appends node(s) or parsed HTML at the top level. A node that currently
  # hangs under a different parent is moved here. Memoised indexes are
  # discarded because the element set changed.
  #
  # @return the last appended node (nil when nothing was appended)
  def add_child(node_or_html)
    nodes = Dom.normalize_replacement(node_or_html, parent: self).dup
    nodes.each do |n|
      previous = n.parent
      if previous.respond_to?(:children) && !previous.equal?(self)
        previous.children.delete_if { |c| c.equal?(n) }
      end
      n.parent = self
      @children << n
    end
    invalidate_indexes! unless nodes.empty?
    nodes.last
  end

  # Serialises the doctype (when set) followed by every child.
  #
  # `doctype` may hold a bare name or a node that serialises itself; the
  # latter is emitted through its own `to_html` rather than being wrapped
  # in a second `<!DOCTYPE …>`.
  # NOTE(review): if the parser also keeps a Doctype node inside
  # `children`, the doctype is emitted twice — confirm against the parser.
  def to_html
    out = +""
    if @doctype
      out << (@doctype.respond_to?(:to_html) ? @doctype.to_html : "<!DOCTYPE #{@doctype}>")
    end
    @children.each { |c| out << c.to_html }
    out
  end
  alias to_s to_html

  # Yields the document and then every descendant, parents before their
  # children. Without a block returns an Enumerator.
  def traverse(&block)
    return enum_for(:traverse) unless block_given?
    yield self
    @children.each do |c|
      if c.respond_to?(:traverse)
        c.traverse(&block)
      else
        yield c
      end
    end
    self
  end

  private

  # Depth-first walk that records each element child of +node+ before
  # descending into it, which keeps the tables in document order.
  def index_subtree(node, cls, tag, ids)
    return unless node.respond_to?(:children)
    node.children.each do |c|
      next unless c.element?
      tag[c.name] << c
      id_attr = c["id"]
      ids[id_attr] ||= c if id_attr && !id_attr.empty?
      class_attr = c["class"]
      class_attr.split(/\s+/).each { |t| cls[t] << c unless t.empty? } if class_attr
      index_subtree(c, cls, tag, ids)
    end
  end

  # Runs +selector+ under each top-level element in turn and stops at the
  # first hit instead of collecting every match first.
  def first_descendant_match(selector)
    @children.each do |c|
      next unless c.element?
      hit = c.css(selector).first
      return hit if hit
    end
    nil
  end
end
539
+
540
+ # ----- helpers -----
541
+
542
# Escapes `&`, `<` and `>` so the string can be emitted as HTML text.
# The argument is stringified first, so nil yields "".
def self.escape_text(s)
  s.to_s.gsub(/[&<>]/) do |ch|
    case ch
    when "&" then "&amp;"
    when "<" then "&lt;"
    else "&gt;"
    end
  end
end
545
+
546
# Escapes `&`, `<`, `>` and `"` so the string can be placed inside a
# double-quoted attribute value. The argument is stringified first.
def self.escape_attr(s)
  entity_for = {
    "&" => "&amp;",
    "<" => "&lt;",
    ">" => "&gt;",
    '"' => "&quot;"
  }
  s.to_s.gsub(/[&<>"]/) { |ch| entity_for[ch] }
end
553
+
554
# Turns whatever a mutation method was given into a list of nodes.
#
# * a node (Element / Text / Comment / Doctype) -> one-element list
# * a String -> the nodes parsed from it
# * an Array -> a new list; Strings and nested Arrays inside it are
#   expanded by these same rules, every other item is kept as-is
# * anything else -> a single Text node built from its `to_s`
#
# The Array case always returns a fresh array, so callers can iterate the
# result while appending to the array that was passed in.
#
# @param input the node(s), HTML string or arbitrary value to normalise
# @param parent the node assigned as parent of a Text node created here
# @return [Array] list of nodes
def self.normalize_replacement(input, parent:)
  case input
  when Element, Text, Comment, Doctype then [input]
  when Array
    input.flat_map do |item|
      case item
      when String, Array then normalize_replacement(item, parent: parent)
      else [item]
      end
    end
  when String then Dom::Parser.fragment(input)
  else [Text.new(input.to_s, parent: parent)]
  end
end
561
+ end
562
+ end
563
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Encoding detection + UTF-8 normalization.
  #
  # The native streaming engine treats the input as a byte stream and tags
  # output strings as UTF-8. To make that honest, we transcode non-UTF-8
  # input to UTF-8 in Ruby before handing it to C — using the cascade the
  # HTML5 spec describes:
  #
  #   1. BOM — UTF-8 / UTF-16 BE/LE (UTF-32 BOMs are recognised as well)
  #   2. <meta charset=...> in the first ~1024 bytes
  #   3. <meta http-equiv="Content-Type" content="...; charset=...">
  #   4. Fall back to UTF-8
  #
  # If the detected encoding equals UTF-8, the bytes are kept and only
  # invalid sequences are removed. Otherwise we transcode with
  # `invalid: :replace, undef: :replace` so a single bad byte doesn't
  # poison the whole document.
  module Encoding
    META_CHARSET_RE = /<meta[^>]+charset\s*=\s*["']?([A-Za-z0-9_\-:]+)/i.freeze
    META_HTTP_EQUIV_RE = /<meta[^>]+http-equiv\s*=\s*["']?content-type["']?[^>]+content\s*=\s*["'][^"'>]*charset=([A-Za-z0-9_\-:]+)/i.freeze
    SNIFF_BYTES = 1024

    BOM_UTF8 = "\xEF\xBB\xBF".b.freeze

    # Byte-order marks and the encoding each announces. The UTF-32LE mark
    # begins with the UTF-16LE mark, so the longer marks come first.
    BOMS = [
      [BOM_UTF8, "UTF-8"],
      ["\xFF\xFE\x00\x00".b.freeze, "UTF-32LE"],
      ["\x00\x00\xFE\xFF".b.freeze, "UTF-32BE"],
      ["\xFF\xFE".b.freeze, "UTF-16LE"],
      ["\xFE\xFF".b.freeze, "UTF-16BE"]
    ].each(&:freeze).freeze

    # Guesses the encoding of +bytes+.
    #
    # @param bytes [String, nil] raw document bytes
    # @return [String] an encoding name; "UTF-8" when nothing is declared
    def self.detect(bytes)
      return "UTF-8" if bytes.nil? || bytes.empty?
      bom = sniff_bom(bytes)
      return bom[1] if bom

      prefix = (bytes.byteslice(0, SNIFF_BYTES) || "").dup.force_encoding(::Encoding::ASCII_8BIT)
      m = prefix.match(META_CHARSET_RE) || prefix.match(META_HTTP_EQUIV_RE)
      return "UTF-8" unless m

      declared = normalize(m[1])
      # The <meta> tag was located by scanning single bytes, so the
      # document cannot really be UTF-16/32. The HTML prescan rule treats
      # such a declaration as UTF-8; honouring it would turn the whole
      # document into garbage.
      declared.start_with?("UTF-16", "UTF-32") ? "UTF-8" : declared
    end

    # Maps a charset label to the encoding name used for transcoding.
    # Matching ignores case and punctuation; unknown labels are returned
    # upper-cased.
    def self.normalize(name)
      n = name.to_s.upcase.gsub(/[^A-Z0-9]/, "")
      case n
      when "UTF8", "UTF8N" then "UTF-8"
      when "LATIN1", "ISO88591", "WINDOWS1252", "WIN1252", "CP1252"
        "WINDOWS-1252"
      when "SHIFTJIS", "SJIS" then "Shift_JIS"
      when "EUCJP" then "EUC-JP"
      when "GBK", "GB2312", "CP936" then "GBK"
      when "BIG5" then "Big5"
      when "UTF16", "UTF16LE" then "UTF-16LE"
      when "UTF16BE" then "UTF-16BE"
      when "USASCII", "ASCII" then "US-ASCII"
      else name.to_s.upcase
      end
    end

    # Best-effort transcode of `bytes` to a UTF-8 String. Strips a leading
    # BOM (UTF-8, UTF-16 or UTF-32). Invalid sequences become ""
    # (dropped). An unknown or unsupported charset falls back to treating
    # the bytes as UTF-8.
    #
    # @param bytes [#to_s] raw document bytes; nil is treated as ""
    # @return [String] a new UTF-8 string; the argument is not modified
    def self.to_utf8(bytes)
      # `dup` also unfreezes the result of `nil.to_s`, which would
      # otherwise make `force_encoding` raise FrozenError.
      s = bytes.to_s.dup
      enc = detect(s)
      s.force_encoding(::Encoding::ASCII_8BIT)

      bom = sniff_bom(s)
      s = s.byteslice(bom[0].bytesize, s.bytesize) if bom

      return scrub_utf8(s) if enc.casecmp("UTF-8").zero?

      begin
        s.force_encoding(enc)
        s.encode(::Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "")
      rescue ::Encoding::ConverterNotFoundError, ArgumentError
        scrub_utf8(s)
      end
    end

    # Returns the `[mark, encoding_name]` pair of the BOM that +bytes+
    # starts with, or nil when there is none.
    def self.sniff_bom(bytes)
      head = (bytes.byteslice(0, 4) || "").dup.force_encoding(::Encoding::ASCII_8BIT)
      BOMS.find { |mark, _name| head.start_with?(mark) }
    end
    private_class_method :sniff_bom

    # Tags +s+ as UTF-8 and removes any byte sequences that are not valid
    # UTF-8.
    def self.scrub_utf8(s)
      s.force_encoding(::Encoding::UTF_8)
      s.valid_encoding? ? s : s.scrub("")
    end
    private_class_method :scrub_utf8
  end
end