readability-rb 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bcab6fa16fd851068954a9fa17a234e29f23f7406e094bc6f265954c29ce231a
4
- data.tar.gz: c2866940fbd118a73065bd51de75491e9a02284ca0287953a53e8841d117029d
3
+ metadata.gz: b7c8a5adc3a628af9f665a4a90129612b5be891797f0b90681ed96be5566a9fb
4
+ data.tar.gz: 76eb683b6a38605b637cbcbda8c25b96ef3060f7f8d277acfd3f4885537a6708
5
5
  SHA512:
6
- metadata.gz: 4ce2c23e7ddc3321dabd4bd45762c8c7950ceb4e3fb2bf1f48852c66937d0715f98027a0233b8e6f2a09628fc28f078eba8e68b2336c7f219890af3ddc9daae3
7
- data.tar.gz: 86b0254a34654db1e910b3b8b0640fca6f77a1f518d29bd9816f49035e9308f492fb747be8e6b82257bdae7a2b184a7e13d469b91ed83e9fd1acdcacdbed3403
6
+ metadata.gz: 94f81c5b0502338b55ed611a6c289dfbd8250b1c058df5b45dff87bd126bcff2038518e85da4dfff0f6baea366f4cdb76305e258936862d063e1f964e621e1bf
7
+ data.tar.gz: ae056bc80680ad1416829e7454f0045c49ac7978450269ea2a5ee29955a7b84a11511100fe4ae2283abba835d4449be3fc22caf037e51fb9cfad4580cc61b5ef
@@ -447,11 +447,15 @@ module Readability
447
447
  next true
448
448
  end
449
449
 
450
- if get_char_count(node, ",") < 10
451
- p_count = node.css("p").length
452
- img_count = node.css("img").length
453
- li_count = node.css("li").length - 100
454
- input_count = node.css("input").length
450
+ inner_text = get_inner_text(node)
451
+
452
+ if inner_text.split(COMMAS).length - 1 < 10
453
+ tag_counts = Hash.new(0)
454
+ node.css("p, img, li, input").each { |n| tag_counts[n.name] += 1 }
455
+ p_count = tag_counts["p"]
456
+ img_count = tag_counts["img"]
457
+ li_count = tag_counts["li"] - 100
458
+ input_count = tag_counts["input"]
455
459
  heading_density = get_text_density(node, ["h1", "h2", "h3", "h4", "h5", "h6"])
456
460
 
457
461
  embed_count = 0
@@ -477,15 +481,13 @@ module Readability
477
481
  end
478
482
  next false if skip_removal
479
483
 
480
- inner_text = get_inner_text(node)
481
-
482
484
  # Toss any node whose inner text contains nothing but suspicious words
483
485
  if AD_WORDS.match?(inner_text) || LOADING_WORDS.match?(inner_text)
484
486
  next true
485
487
  end
486
488
 
487
489
  content_length = inner_text.length
488
- link_density = get_link_density(node)
490
+ link_density = get_link_density(node, text_length: content_length)
489
491
  textish_tags = %w[span li td] + DIV_TO_P_ELEMS.to_a
490
492
  text_density = get_text_density(node, textish_tags)
491
493
  is_figure_child = has_ancestor_tag?(node, "figure")
@@ -42,9 +42,14 @@ module Readability
42
42
  def parse
43
43
  # Avoid parsing too large documents
44
44
  if @max_elems_to_parse > 0
45
- num_tags = @doc.css("*").length
46
- if num_tags > @max_elems_to_parse
47
- raise "Aborting parsing document; #{num_tags} elements found"
45
+ count = 0
46
+ @doc.traverse do |n|
47
+ if n.element?
48
+ count += 1
49
+ if count > @max_elems_to_parse
50
+ raise "Aborting parsing document; #{count} elements found"
51
+ end
52
+ end
48
53
  end
49
54
  end
50
55
 
@@ -59,6 +64,9 @@ module Readability
59
64
 
60
65
  prep_document
61
66
 
67
+ # Cache the prepped body HTML for retry re-parsing (avoids innerHTML= cost)
68
+ @prepped_body_html = @doc.at_css("body")&.inner_html
69
+
62
70
  metadata = get_article_metadata(json_ld)
63
71
  @metadata = metadata
64
72
  @article_title = metadata["title"]
@@ -109,7 +117,9 @@ module Readability
109
117
  return nil
110
118
  end
111
119
 
112
- page_cache_html = page.inner_html
120
+ # Preserve the lang attribute from the HTML element before any retry re-parsing
121
+ preserved_article_lang = @doc.root && @doc.root["lang"]
122
+ preserved_article_dir = @doc.root && @doc.root["dir"]
113
123
 
114
124
  while true
115
125
  log("Starting grabArticle loop")
@@ -497,10 +507,10 @@ module Readability
497
507
  text_length = get_inner_text(article_content, true).length
498
508
  if text_length < @char_threshold
499
509
  parse_successful = false
500
- page.inner_html = page_cache_html
501
510
 
511
+ # Store serialized HTML instead of node references to avoid pinning old documents
502
512
  @attempts << {
503
- article_content: article_content,
513
+ html: article_content.inner_html,
504
514
  text_length: text_length
505
515
  }
506
516
 
@@ -517,9 +527,30 @@ module Readability
517
527
  # But first check if we actually have something
518
528
  return nil if @attempts[0][:text_length] == 0
519
529
 
520
- article_content = @attempts[0][:article_content]
530
+ # Re-parse the best attempt from serialized HTML
531
+ best_doc = Nokogiri::HTML5("<html><body>#{@attempts[0][:html]}</body></html>")
532
+ best_doc.root["lang"] = preserved_article_lang if preserved_article_lang
533
+ best_doc.root["dir"] = preserved_article_dir if preserved_article_dir
534
+ article_content = best_doc.at_css("body")
535
+ @doc = best_doc
521
536
  parse_successful = true
522
537
  end
538
+
539
+ unless parse_successful
540
+ # Create a fresh document from the prepped body HTML, allowing the old one to be GC'd
541
+ @doc = Nokogiri::HTML5("<html><head></head><body>#{@prepped_body_html}</body></html>")
542
+ # Restore the lang attribute on the new HTML element so it's picked up during traversal
543
+ @doc.root["lang"] = preserved_article_lang if preserved_article_lang
544
+ @doc.root["dir"] = preserved_article_dir if preserved_article_dir
545
+ page = @doc.at_css("body")
546
+
547
+ # Clear node-referencing instance variables since they point to the old document
548
+ @candidates = {}
549
+ @data_tables = Set.new
550
+ @article_byline = nil
551
+ @article_dir = nil
552
+ @article_lang = preserved_article_lang
553
+ end
523
554
  end
524
555
 
525
556
  if parse_successful
@@ -50,8 +50,8 @@ module Readability
50
50
  # Port of _getLinkDensity (JS line 2143)
51
51
  # Returns the ratio of anchor text length to total text length.
52
52
  # Fragment-only links (#...) count at 0.3 coefficient.
53
- def get_link_density(element)
54
- text_length = get_inner_text(element).length
53
+ def get_link_density(element, text_length: nil)
54
+ text_length ||= get_inner_text(element).length
55
55
  return 0 if text_length == 0
56
56
 
57
57
  link_length = 0.0
@@ -76,7 +76,7 @@ module Readability
76
76
  js_trim(node.text).empty? &&
77
77
  (node.element_children.empty? ||
78
78
  node.element_children.length ==
79
- node.css("br").length + node.css("hr").length)
79
+ node.css("br, hr").length)
80
80
  end
81
81
 
82
82
  # Port of _hasChildBlockElement (JS line 2044)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Readability
4
- VERSION = "0.1.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: readability-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andy Croll
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2026-04-13 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: nokogiri
@@ -52,7 +51,6 @@ metadata:
52
51
  source_code_uri: https://github.com/andycroll/readability-rb
53
52
  changelog_uri: https://github.com/andycroll/readability-rb/commits/main
54
53
  bug_tracker_uri: https://github.com/andycroll/readability-rb/issues
55
- post_install_message:
56
54
  rdoc_options: []
57
55
  require_paths:
58
56
  - lib
@@ -60,15 +58,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
60
58
  requirements:
61
59
  - - ">="
62
60
  - !ruby/object:Gem::Version
63
- version: '3.1'
61
+ version: '3.2'
64
62
  required_rubygems_version: !ruby/object:Gem::Requirement
65
63
  requirements:
66
64
  - - ">="
67
65
  - !ruby/object:Gem::Version
68
66
  version: '0'
69
67
  requirements: []
70
- rubygems_version: 3.5.3
71
- signing_key:
68
+ rubygems_version: 3.6.9
72
69
  specification_version: 4
73
70
  summary: Extract readable article content from HTML pages
74
71
  test_files: []