readability-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,742 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Readability
4
+ module Cleaner
5
+ private
6
+
7
+ # Port of _setNodeTag (JS line 762)
8
+ # In Nokogiri, we can mutate the tag name in place.
9
# Renames +node+ in place to the lower-cased +tag+.
#
# @param node [#name=] node whose tag name is rewritten
# @param tag [String] new tag name, in any letter case
# @return the same +node+ object that was passed in
def set_node_tag(node, tag)
  node.tap { |target| target.name = tag.downcase }
end
13
+
14
+ # Port of _prepDocument (JS line 669)
15
# Prepares @doc for scoring: drops <style> elements and comment nodes,
# collapses <br> chains inside <body> (when one exists) and renames every
# <font> element to <span>.
def prep_document
  style_nodes = get_all_nodes_with_tag(@doc, %w[style])
  remove_nodes(style_nodes)

  # Comments interfere with phrasing-content wrapping and are absent from
  # the expected output of the JS test suite, so strip them all.
  @doc.traverse do |candidate|
    next unless candidate.comment?

    candidate.unlink
  end

  page_body = @doc.at_css("body")
  replace_brs(page_body) if page_body

  font_nodes = get_all_nodes_with_tag(@doc, %w[font])
  replace_node_tags(font_nodes, "span")
end
28
+
29
+ # Port of _replaceBrs (JS line 706)
30
+ # Replace 2+ consecutive <br> elements with a <p>, collecting
31
+ # following phrasing content as children.
32
# Collapses every run of two or more consecutive <br> elements under +elem+
# into a single <p>, then moves the phrasing content that follows the run
# into that new paragraph.
#
# @param elem [Nokogiri::XML::Node] subtree to process
def replace_brs(elem)
  get_all_nodes_with_tag(elem, ["br"]).each do |br|
    chain_found = false

    # Remove every <br> that directly follows this one (next_node decides
    # what may sit in between). Only the first <br> of the chain survives.
    follower = next_node(br.next_sibling)
    while follower && follower.name == "br"
      chain_found = true
      after_follower = follower.next_sibling
      follower.unlink
      follower = next_node(after_follower)
    end

    # A lone <br> is left untouched.
    next unless chain_found

    paragraph = Nokogiri::XML::Node.new("p", @doc)
    br.replace(paragraph)

    cursor = paragraph.next_sibling
    while cursor
      # Another <br><br> pair starts a new paragraph, so stop collecting here.
      if cursor.name == "br"
        peek = next_node(cursor.next_sibling)
        break if peek && peek.name == "br"
      end

      break unless is_phrasing_content?(cursor)

      # Remember the sibling first: add_child reparents the cursor.
      upcoming = cursor.next_sibling
      paragraph.add_child(cursor)
      cursor = upcoming
    end

    # Strip trailing whitespace nodes from the new paragraph.
    loop do
      tail = paragraph.children.last
      break unless tail && is_whitespace?(tail)

      tail.unlink
    end

    # A <p> may not contain another <p>; demote the enclosing one to <div>.
    enclosing = paragraph.parent
    set_node_tag(enclosing, "div") if enclosing && enclosing.name == "p"
  end
end
83
+
84
+ # Port of _removeScripts (JS line 2001)
85
# Removes every <script> and <noscript> element found under +doc+.
#
# @param doc [Nokogiri::XML::Node] document or subtree to strip
# @return whatever remove_nodes returns
def remove_scripts(doc)
  script_like = get_all_nodes_with_tag(doc, %w[script noscript])
  remove_nodes(script_like)
end
88
+
89
+ # Port of _unwrapNoscriptImages (JS line 1918-1993)
90
# Two-pass clean-up of lazy-loaded images under +doc+.
#
# Pass 1 removes every <img> that has no src/srcset/data-src/data-srcset
# attribute and no attribute value that looks like an image file name.
#
# Pass 2 looks at each <noscript> that is_single_image? accepts. When its
# previous element sibling is also accepted by is_single_image?, that sibling
# is replaced by the first element parsed from the <noscript> markup, after
# copying over any image-related attributes of the old image that the new
# one lacks or has with a different value (conflicts are kept under a
# "data-old-" prefix).
#
# @param doc [Nokogiri::XML::Node] document or subtree to process
def unwrap_noscript_images(doc)
  image_ext = /\.(jpg|jpeg|png|webp)/i
  source_attrs = %w[src srcset data-src data-srcset]

  doc.css("img").to_a.each do |img|
    has_source = img.attributes.each_value.any? do |attr|
      source_attrs.include?(attr.name) || image_ext.match?(attr.value)
    end
    img.unlink unless has_source
  end

  doc.css("noscript").to_a.each do |noscript|
    next unless is_single_image?(noscript)

    prev_element = noscript.previous_element
    next unless prev_element && is_single_image?(prev_element)

    # Parse the <noscript> markup only once we know there is a placeholder to
    # replace; parsing earlier is wasted work when the check above fails.
    tmp = Nokogiri::HTML::DocumentFragment.parse(noscript.inner_html)
    new_img = tmp.at_css("img")
    next unless new_img

    prev_img = prev_element.name == "img" ? prev_element : prev_element.at_css("img")
    # Defensive: nothing to merge if the placeholder holds no <img> after all.
    next unless prev_img

    prev_img.attributes.each_value do |attr|
      value = attr.value
      next if value.empty?
      next unless attr.name == "src" || attr.name == "srcset" || image_ext.match?(value)
      next if new_img[attr.name] == value

      # Never clobber an attribute the new image already carries.
      target = new_img[attr.name] ? "data-old-#{attr.name}" : attr.name
      new_img[target] = value
    end

    prev_element.replace(tmp.element_children.first)
  end
end
147
+
148
+ # Port of _prepArticle (JS line 792)
149
# Final clean-up of the extracted article subtree: strips styling, marks
# data tables, fixes lazy images, removes junk elements (forms, embeds,
# share widgets, empty paragraphs, ...) and unwraps single-cell tables.
#
# @param article_content [Nokogiri::XML::Node] root of the extracted article
def prep_article(article_content)
  clean_styles(article_content)

  # Data tables must be identified before any conditional cleaning runs.
  mark_data_tables(article_content)

  fix_lazy_images(article_content)

  # Clean out junk from the article content.
  %w[form fieldset].each { |tag| clean_conditionally(article_content, tag) }
  %w[object embed footer link aside].each { |tag| clean(article_content, tag) }

  # Inside each top candidate, drop short elements whose class/id
  # combination mentions "share".
  share_element_threshold = DEFAULT_CHAR_THRESHOLD
  article_content.element_children.each do |top_candidate|
    clean_matched_nodes(top_candidate) do |node, match_string|
      SHARE_ELEMENTS.match?(match_string) && node.text.length < share_element_threshold
    end
  end

  %w[iframe input textarea select button].each { |tag| clean(article_content, tag) }
  clean_headers(article_content)

  # Do these last: the steps above may have removed junk that affects them.
  %w[table ul div].each { |tag| clean_conditionally(article_content, tag) }

  # H1 is reserved for the separately displayed title, so demote to H2.
  replace_node_tags(get_all_nodes_with_tag(article_content, ["h1"]), "h2")

  # Remove paragraphs with neither media elements nor text.
  remove_nodes(get_all_nodes_with_tag(article_content, ["p"])) do |paragraph|
    media = get_all_nodes_with_tag(paragraph, %w[img embed object iframe])
    media.length == 0 && get_inner_text(paragraph, false).empty?
  end

  # Remove a <br> that sits directly before a <p>.
  get_all_nodes_with_tag(article_content, ["br"]).each do |br|
    following = next_node(br.next_sibling)
    br.unlink if following && following.name == "p"
  end

  # Replace single-cell tables by their only cell, renamed to <p> or <div>.
  get_all_nodes_with_tag(article_content, ["table"]).to_a.each do |table|
    tbody = has_single_tag_inside_element?(table, "tbody") ? table.element_children.first : table
    next unless has_single_tag_inside_element?(tbody, "tr")

    row = tbody.element_children.first
    next unless has_single_tag_inside_element?(row, "td")

    cell = row.element_children.first
    new_tag = cell.children.all? { |child| is_phrasing_content?(child) } ? "p" : "div"
    table.replace(set_node_tag(cell, new_tag))
  end
end
219
+
220
+ # Port of _cleanStyles (JS line 2114)
221
# Recursively strips presentational attributes from +elem+ and its element
# descendants. <svg> subtrees are left untouched. Elements listed in
# DEPRECATED_SIZE_ATTRIBUTE_ELEMS also lose their width/height attributes.
#
# @param elem [Nokogiri::XML::Node, nil] subtree root; nil is a no-op
# @return [nil]
def clean_styles(elem)
  return unless elem
  return if elem.name == "svg"

  PRESENTATIONAL_ATTRIBUTES.each { |attr| elem.remove_attribute(attr) }

  if DEPRECATED_SIZE_ATTRIBUTE_ELEMS.include?(elem.name)
    %w[width height].each { |attr| elem.remove_attribute(attr) }
  end

  child = elem.element_children.first
  until child.nil?
    clean_styles(child)
    child = child.next_element
  end
end
240
+
241
+ # Port of _markDataTables (JS line 2297-2354)
242
# Adds to @data_tables every <table> under +root+ that looks like a real
# data table rather than a layout table.
#
# Checks, in order: role="presentation" and datatable="0" exclude the table;
# a non-empty summary, a caption with children, or any col/colgroup/tfoot/
# thead/th descendant include it; a nested table excludes it; otherwise the
# row/column counts from get_row_and_column_count decide.
#
# @param root [Nokogiri::XML::Node] subtree to scan
def mark_data_tables(root)
  structural_tags = %w[col colgroup tfoot thead th]

  root.css("table").each do |table|
    next if table["role"] == "presentation"
    next if table["datatable"] == "0"

    summary = table["summary"]
    verdict =
      if summary && !summary.empty?
        true
      elsif (caption = table.at_css("caption")) && caption.children.length > 0
        true
      elsif structural_tags.any? { |tag| table.at_css(tag) }
        true
      elsif table.at_css("table")
        # Nested tables indicate a layout table.
        false
      else
        dims = get_row_and_column_count(table)
        if dims[:columns] == 1 || dims[:rows] == 1
          false
        elsif dims[:rows] >= 10 || dims[:columns] > 4
          true
        else
          # Otherwise go by overall size only.
          dims[:rows] * dims[:columns] > 10
        end
      end

    @data_tables.add(table) if verdict
  end
end
294
+
295
+ # Port of _getRowAndColumnCount (JS line 2266)
296
# Counts the logical rows and columns of +table+, honouring rowspan on <tr>
# and colspan on <td>. A missing, non-numeric or non-positive span counts
# as 1. The column count is the widest row found.
#
# @param table [Nokogiri::XML::Node] table element
# @return [Hash] { rows: Integer, columns: Integer }
def get_row_and_column_count(table)
  span_of = lambda do |element, attr_name|
    span = (element[attr_name] || 0).to_i
    span > 0 ? span : 1
  end

  rows = 0
  columns = 0
  table.css("tr").each do |tr|
    rows += span_of.call(tr, "rowspan")
    row_width = tr.css("td").sum { |cell| span_of.call(cell, "colspan") }
    columns = row_width if row_width > columns
  end

  { rows: rows, columns: columns }
end
314
+
315
+ # Port of _fixLazyImages (JS line 2358)
316
# Converts lazy-loaded <img>/<picture>/<figure> elements under +root+ into
# elements with a real src/srcset.
#
# Step 1: a short base64 data-URL src (less than 133 characters after the
# data-URL prefix matched by B64_DATA_URL; SVG excluded) is removed when
# another attribute references an image file.
# Step 2: unless the element already has a non-empty src or srcset and no
# "lazy" class, every other attribute that looks like an image URL (or
# srcset list) is copied to src (or srcset). For a <figure> without any
# img/picture descendant a new <img> child is created instead.
#
# @param root [Nokogiri::XML::Node] subtree to process
def fix_lazy_images(root)
  image_ext = /\.(jpg|jpeg|png|webp)/i

  get_all_nodes_with_tag(root, ["img", "picture", "figure"]).each do |elem|
    src = elem["src"]
    parts = src && B64_DATA_URL.match(src)

    if parts
      # SVG can be a meaningful image in under 133 bytes, so leave it alone.
      next if parts[1] == "image/svg+xml"

      referenced_elsewhere = elem.attributes.each_value.any? do |attr|
        attr.name != "src" && image_ext.match?(attr.value)
      end

      if referenced_elsewhere
        payload_length = src.length - parts[0].length
        elem.remove_attribute("src") if payload_length < 133
      end
    end

    # JS treats "" as falsy, so an empty src/srcset counts as absent here.
    # The "null" srcset check works around a jsdom quirk.
    current_src = elem["src"]
    current_srcset = elem["srcset"]
    has_src = current_src && !current_src.empty?
    has_srcset = current_srcset && current_srcset != "null" && !current_srcset.empty?
    lazy_class = (elem["class"] || "").downcase.include?("lazy")
    next if (has_src || has_srcset) && !lazy_class

    elem.attributes.each_value do |attr|
      next if %w[src srcset alt].include?(attr.name)

      copy_to =
        if /\.(jpg|jpeg|png|webp)\s+\d/.match?(attr.value)
          "srcset"
        elsif /\A\s*\S+\.(jpg|jpeg|png|webp)\S*\s*\z/.match?(attr.value)
          "src"
        end
      next unless copy_to

      case elem.name
      when "img", "picture"
        elem[copy_to] = attr.value
      when "figure"
        next unless get_all_nodes_with_tag(elem, ["img", "picture"]).empty?

        img = Nokogiri::XML::Node.new("img", @doc)
        img[copy_to] = attr.value
        elem.add_child(img)
      end
    end
  end
end
378
+
379
+ # Port of _clean (JS line 2208)
380
# Removes every +tag+ element under +elem+. For object/embed/iframe tags an
# element is kept when any attribute value matches @allowed_video_regex, or
# when it is an <object> whose inner HTML matches that regex.
#
# @param elem [Nokogiri::XML::Node] subtree to clean
# @param tag [String] lower-case tag name to remove
# @return whatever remove_nodes returns
def clean(elem, tag)
  embed_tag = %w[object embed iframe].include?(tag)

  remove_nodes(get_all_nodes_with_tag(elem, [tag])) do |element|
    # Non-embed tags are removed unconditionally.
    next true unless embed_tag

    # Allow whitelisted videos through.
    next false if element.attributes.each_value.any? { |attr| @allowed_video_regex.match?(attr.value) }

    # For <object>, the video URL may live in the inner markup instead.
    !(element.name == "object" && @allowed_video_regex.match?(element.inner_html))
  end
end
405
+
406
+ # Port of _cleanConditionally (JS line 2460-2657)
407
# Removes +tag+ elements under +elem+ that look like non-content ("fishy")
# blocks. Does nothing unless FLAG_CLEAN_CONDITIONALLY is active.
#
# An element is always kept when it is, contains, or sits inside a table
# recorded in @data_tables, or sits inside <code>. It is removed when its
# class weight is negative. Otherwise, if it has fewer than 10 commas, a
# series of ratio checks (images vs paragraphs, list items, inputs, link
# density, embeds, text density) decides; whitelisted video embeds and
# simple image lists are spared.
#
# @param elem [Nokogiri::XML::Node] subtree to clean
# @param tag [String] lower-case tag name to examine
def clean_conditionally(elem, tag)
  return unless flag_is_active?(FLAG_CLEAN_CONDITIONALLY)

  data_table = ->(candidate) { @data_tables.include?(candidate) }
  list_tag = tag == "ul" || tag == "ol"

  remove_nodes(get_all_nodes_with_tag(elem, [tag])) do |node|
    # Treat the node as a list if the tag says so, or if more than 90% of
    # its text lives inside nested lists.
    is_list = list_tag
    unless is_list
      list_length = get_all_nodes_with_tag(node, ["ul", "ol"]).sum { |list| get_inner_text(list).length }
      node_text_length = get_inner_text(node).length
      is_list = node_text_length > 0 && list_length.to_f / node_text_length > 0.9
    end

    # Never touch data tables, anything inside them, anything inside <code>,
    # or anything that contains a data table.
    next false if tag == "table" && data_table.call(node)
    next false if has_ancestor_tag?(node, "table", -1, &data_table)
    next false if has_ancestor_tag?(node, "code")
    next false if node.css("table").any? { |tbl| @data_tables.include?(tbl) }

    weight = get_class_weight(node)
    content_score = 0
    next true if weight + content_score < 0

    # Plenty of commas: assume real prose and keep the node.
    next false unless get_char_count(node, ",") < 10

    p_count = node.css("p").length
    img_count = node.css("img").length
    li_count = node.css("li").length - 100
    input_count = node.css("input").length
    heading_density = get_text_density(node, ["h1", "h2", "h3", "h4", "h5", "h6"])

    # Count embeds, bailing out as soon as a whitelisted video is found.
    embed_count = 0
    whitelisted_video = false
    get_all_nodes_with_tag(node, ["object", "embed", "iframe"]).each do |embed_node|
      whitelisted_video =
        embed_node.attributes.each_value.any? { |attr| @allowed_video_regex.match?(attr.value) } ||
        (embed_node.name == "object" && @allowed_video_regex.match?(embed_node.inner_html))
      break if whitelisted_video

      embed_count += 1
    end
    next false if whitelisted_video

    inner_text = get_inner_text(node)

    # Toss any node whose inner text matches the ad/loading word patterns.
    next true if AD_WORDS.match?(inner_text) || LOADING_WORDS.match?(inner_text)

    content_length = inner_text.length
    link_density = get_link_density(node)
    textish_tags = %w[span li td] + DIV_TO_P_ELEMS.to_a
    text_density = get_text_density(node, textish_tags)
    is_figure_child = has_ancestor_tag?(node, "figure")

    # Shadiness checks: each failed check records its reason.
    reasons = []
    reasons << "Bad p to img ratio" if !is_figure_child && img_count > 1 && p_count.to_f / img_count < 0.5
    reasons << "Too many li's outside of a list" if !is_list && li_count > p_count
    reasons << "Too many inputs per p" if input_count > (p_count / 3).floor
    if !is_list && !is_figure_child && heading_density < 0.9 &&
       content_length < 25 && (img_count == 0 || img_count > 2) && link_density > 0
      reasons << "Suspiciously short"
    end
    if !is_list && weight < 25 && link_density > 0.2 + @link_density_modifier
      reasons << "Low weight and a little linky"
    end
    if weight >= 25 && link_density > 0.5 + @link_density_modifier
      reasons << "High weight and mostly links"
    end
    reasons << "Suspicious embed" if (embed_count == 1 && content_length < 75) || embed_count > 1
    reasons << "No useful content" if img_count == 0 && text_density == 0

    have_to_remove = reasons.any?

    # Allow simple lists of images to remain: every direct child has at most
    # one element child and there is exactly one image per <li>.
    if is_list && have_to_remove
      simple_items = node.element_children.all? { |child| child.element_children.length <= 1 }
      have_to_remove = false if simple_items && img_count == node.css("li").length
    end

    have_to_remove
  end
end
547
+
548
+ # Port of _cleanMatchedNodes (JS line 2667)
549
# Walks the nodes that follow +elem+ in document order (via get_next_node)
# up to, but not including, the node returned by get_next_node(elem, true).
# Every visited node for which the block returns a truthy value is removed.
#
# @param elem [Nokogiri::XML::Node] root of the walk
# @yieldparam node the node under inspection
# @yieldparam match_string [String] "<class> <id>" of that node (missing
#   attributes contribute an empty string)
# @return [nil]
def clean_matched_nodes(elem, &filter)
  stop_at = get_next_node(elem, true)
  current = get_next_node(elem)

  until !current || current == stop_at
    label = [current["class"] || "", current["id"] || ""].join(" ")
    current = filter.call(current, label) ? remove_and_get_next(current) : get_next_node(current)
  end
end
561
+
562
+ # Port of _cleanHeaders (JS line 2685)
563
# Removes every <h1>/<h2> under +elem+ whose class weight is negative.
#
# @param elem [Nokogiri::XML::Node] subtree to clean
# @return whatever remove_nodes returns
def clean_headers(elem)
  remove_nodes(get_all_nodes_with_tag(elem, %w[h1 h2])) { |heading| get_class_weight(heading) < 0 }
end
569
+
570
+ # Port of _headerDuplicatesTitle (JS line 2703)
571
# Tells whether +node+ is an <h1>/<h2> whose text is more than 75% similar
# (per text_similarity) to @article_title; a nil title is compared as "".
#
# @param node [Nokogiri::XML::Node] candidate heading
# @return [Boolean]
def header_duplicates_title?(node)
  return false unless %w[h1 h2].include?(node.name)

  heading_text = get_inner_text(node, false)
  title = @article_title || ""
  text_similarity(title, heading_text) > 0.75
end
577
+
578
+ # Port of _postProcessContent (JS line 282)
579
# Post-processing of the extracted article: makes URIs absolute, flattens
# redundant nested wrappers and, unless @keep_classes is set, strips class
# attributes.
#
# @param article_content [Nokogiri::XML::Node] root of the extracted article
def post_process_content(article_content)
  fix_relative_uris(article_content)
  simplify_nested_elements(article_content)
  return if @keep_classes

  clean_classes(article_content)
end
585
+
586
+ # Port of _fixRelativeUris (JS line 457-536)
587
# Rewrites relative URIs inside +article_content+ to absolute ones, using
# @url as the document URI and honouring a <base href> found in @doc.
# Does nothing when @url is not set.
#
# * <a href>: "javascript:" links are replaced by their text (or by a <span>
#   holding their children); every other non-empty href is made absolute.
# * img/picture/figure/video/audio/source: non-empty src, poster and each
#   URL inside srcset are made absolute.
#
# Empty attribute values are left untouched. In the JS original `""` is
# falsy and skipped; resolving it here would turn `href=""` / `src=""` into
# the document URL.
#
# @param article_content [Nokogiri::XML::Node] root of the extracted article
def fix_relative_uris(article_content)
  document_uri = @url
  return unless document_uri

  # Effective base URI, like JS document.baseURI.
  base_uri = document_uri
  base_element = @doc.at_css("base[href]")
  base_href = base_element && base_element["href"]
  if base_href && !base_href.empty?
    begin
      base_uri = URI.join(document_uri, base_href).to_s
    rescue URI::InvalidURIError, URI::InvalidComponentError, URI::BadURIError
      # Unusable <base href>: keep document_uri as the base.
    end
  end

  present = ->(value) { value && !value.empty? }

  to_absolute_uri = lambda do |uri|
    # Nokogiri preserves surrounding whitespace/newlines in attribute
    # values, whereas the JS DOM normalizes them.
    uri = uri.strip

    # Leave hash links alone if the base URI matches the document URI.
    return uri if base_uri == document_uri && uri.start_with?("#")

    # Any non-HTTP(S) scheme is returned as-is, without parsing.
    if uri.match?(/\A[a-z][a-z0-9+\-.]*:/i) && !uri.match?(/\Ahttps?:/i)
      # Normalize Windows drive letters in file: URIs (C| -> C:) per the
      # WHATWG URL spec.
      if uri.match?(/\Afile:/i)
        return uri.sub(%r{\A(file:///[A-Za-z])\|(/)}i, '\1:\2')
      end
      return uri
    end

    begin
      resolved = URI.join(base_uri, uri)
      # Match JS URL normalization.
      if resolved.is_a?(URI::HTTP)
        # An empty path becomes "/".
        resolved.path = "/" if resolved.path.nil? || resolved.path.empty?
        # Host names are lower-cased (as JS new URL() does).
        resolved.host = resolved.host.downcase if resolved.host
      end
      resolved.to_s
    rescue URI::InvalidURIError, URI::InvalidComponentError, URI::BadURIError
      # URI.join failed: fall back to treating +uri+ as a path relative to
      # the directory of the base URI.
      begin
        base = URI.parse(base_uri)
        base_dir = base.path.sub(%r{/[^/]*\z}, "/")
        base.path = base_dir + uri
        base.to_s
      rescue
        uri
      end
    end
  end

  # Fix anchor tags.
  get_all_nodes_with_tag(article_content, ["a"]).to_a.each do |link|
    href = link["href"]
    next unless present.call(href)

    unless href.strip.start_with?("javascript:")
      link["href"] = to_absolute_uri.call(href)
      next
    end

    # javascript: links are unwrapped so that no script URL survives.
    if link.children.length == 1 && link.children[0].text?
      link.replace(Nokogiri::XML::Text.new(link.text, @doc))
    else
      container = Nokogiri::XML::Node.new("span", @doc)
      container.add_child(link.children.first) while link.children.first
      link.replace(container)
    end
  end

  # Fix media tags.
  media_tags = %w[img picture figure video audio source]
  get_all_nodes_with_tag(article_content, media_tags).each do |media|
    src = media["src"]
    poster = media["poster"]
    srcset = media["srcset"]

    media["src"] = to_absolute_uri.call(src) if present.call(src)
    media["poster"] = to_absolute_uri.call(poster) if present.call(poster)
    next unless present.call(srcset)

    media["srcset"] = srcset.gsub(SRCSET_URL) do
      # Capture the groups before calling out, so later matching cannot
      # disturb them.
      url = Regexp.last_match(1)
      descriptor = Regexp.last_match(2) || ""
      separator = Regexp.last_match(3)
      "#{to_absolute_uri.call(url)}#{descriptor}#{separator}"
    end
  end
end
692
+
693
+ # Port of _simplifyNestedElements (JS line 538-566)
694
# Walks the article in document order and simplifies <div>/<section>
# wrappers whose id does not start with "readability": wrappers without
# content are removed, and a wrapper holding a single <div> or <section>
# is replaced by that child (which inherits the wrapper's attributes).
#
# @param article_content [Nokogiri::XML::Node] root of the extracted article
def simplify_nested_elements(article_content)
  current = article_content

  while current
    node_id = current["id"]
    wrapper_like = current.parent &&
                   %w[div section].include?(current.name) &&
                   !(node_id && node_id.start_with?("readability"))

    if wrapper_like
      if is_element_without_content?(current)
        current = remove_and_get_next(current)
        next
      end

      if has_single_tag_inside_element?(current, "div") ||
         has_single_tag_inside_element?(current, "section")
        only_child = current.element_children[0]
        # The child takes over the wrapper's attributes, overwriting its own
        # on a name clash.
        current.attributes.each_value { |attr| only_child[attr.name] = attr.value }
        current.replace(only_child)
        # Re-examine the promoted child before moving on.
        current = only_child
        next
      end
    end

    current = get_next_node(current)
  end
end
721
+
722
+ # Port of _cleanClasses (JS line 418)
723
# Recursively reduces the class attribute of +node+ and its element
# descendants to the names listed in @classes_to_preserve; the attribute is
# removed altogether when nothing is left.
#
# @param node [Nokogiri::XML::Node] subtree root
# @return [nil]
def clean_classes(node)
  preserved = (node["class"] || "")
              .split(/\s+/)
              .select { |cls| @classes_to_preserve.include?(cls) }
              .join(" ")

  node.remove_attribute("class") if preserved.empty?
  node["class"] = preserved unless preserved.empty?

  child = node.element_children.first
  until child.nil?
    clean_classes(child)
    child = child.next_element
  end
end
741
+ end
742
+ end