markdownr 0.4.6 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 596b75e231efdc839c8729fe1db59a8b2933c08f12059cf753b261445d888813
4
- data.tar.gz: ddc7fc42eb7f55286e0edcb296ec816567146e1ad412ae4df4243ab31255e4ea
3
+ metadata.gz: ae4a95fd815bc7acdaf267cf6ac5b056debfa7da5812b66c254e0ef002c01da6
4
+ data.tar.gz: cb4e91f2e5aab746588702770f331e20fea2d71e586c84835417cf5d6d27ffe7
5
5
  SHA512:
6
- metadata.gz: 34a2585d6a3818cd175d40f8b9f728bee1a83b3dbf2ac238c320b8b2be9d49c895dc9fb9709edfeed149317d4a03caa1536a87075da706711ceafa61a3c789d9
7
- data.tar.gz: 256c941d8ad83a87fca9d205be075390c425c3b883dbd7fc3b2ed4f71e0f61c6cc29517a56897046c9c2a9c4507978b84445c9edeaf456e054527e9be1ed0f00
6
+ metadata.gz: 20729eb32420c42496a531229d72baf7692cd0de81eed5578f48e1a20fdf844fe4235367174c24b27e6cabbccf1223eb17dfe73c40a35e2058475334dd4fff50
7
+ data.tar.gz: b2902ec6c5bc7ae048da0c4f5f6d54cadd09003f8fdf524b59e2b96f519d4ca8b9a6e0cd30635266dfc64fa678eca09103aad5366bd96a840c995f6eaa000111
@@ -435,6 +435,17 @@ module MarkdownServer
435
435
  FETCH_MAX_BYTES = 512_000
436
436
  FETCH_TIMEOUT = 5
437
437
 
438
+ # Tags kept as-is (attributes stripped)
439
+ ALLOWED_HTML = %w[p h1 h2 h3 h4 h5 h6 blockquote ul ol li
440
+ pre br hr strong b em i sup sub code
441
+ table tr td th].to_set
442
+ # Block containers — replaced with a newline (content kept)
443
+ BLOCK_HTML = %w[div section aside figure figcaption
444
+ thead tbody tfoot].to_set
445
+ # Elements removed completely, including their content
446
+ STRIP_FULL = %w[script style nav header footer form input
447
+ button select textarea svg iframe noscript].to_set
448
+
438
449
  def fetch_external_page(url_str)
439
450
  uri = URI.parse(url_str)
440
451
  return nil unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
@@ -480,26 +491,41 @@ module MarkdownServer
480
491
  } || ""
481
492
  end
482
493
 
483
- def page_text(html)
484
- # Strip inert elements
485
- w = html
486
- .gsub(/<script[^>]*>.*?<\/script>/im, " ")
487
- .gsub(/<style[^>]*>.*?<\/style>/im, " ")
488
- .gsub(/<nav[^>]*>.*?<\/nav>/im, " ")
489
- .gsub(/<header[^>]*>.*?<\/header>/im," ")
490
- .gsub(/<footer[^>]*>.*?<\/footer>/im," ")
491
- .gsub(/<!--.*?-->/m, " ")
492
- # Prefer <article> or <main>, fall back to <body>, then whole doc
494
+ def page_html(raw)
495
+ w = raw.dup
496
+ # Remove inert elements and their entire contents
497
+ STRIP_FULL.each { |t| w.gsub!(/<#{t}[^>]*>.*?<\/#{t}>/im, " ") }
498
+ w.gsub!(/<!--.*?-->/m, " ")
499
+
500
+ # Prefer a focused content block
493
501
  content = w.match(/<article[^>]*>(.*?)<\/article>/im)&.[](1) ||
494
502
  w.match(/<main[^>]*>(.*?)<\/main>/im)&.[](1) ||
495
503
  w.match(/<body[^>]*>(.*?)<\/body>/im)&.[](1) ||
496
504
  w
497
- text = content
498
- .gsub(/<[^>]+>/, " ")
499
- .gsub(/&nbsp;/i, " ").gsub(/&amp;/i, "&").gsub(/&lt;/i, "<")
500
- .gsub(/&gt;/i, ">").gsub(/&quot;/i, '"').gsub(/&#?\w+;/, " ")
501
- .gsub(/\s+/, " ").strip
502
- text.length > 3000 ? "#{text[0, 3000]}…" : text
505
+
506
+ # Rewrite tags: keep allowed (strip attrs), block→newline, rest→empty
507
+ out = content.gsub(/<(\/?)(\w+)[^>]*>/) do
508
+ slash, tag = $1, $2.downcase
509
+ if ALLOWED_HTML.include?(tag) then "<#{slash}#{tag}>"
510
+ elsif BLOCK_HTML.include?(tag) then "\n"
511
+ else ""
512
+ end
513
+ end
514
+
515
+ # Decode HTML entities
516
+ out = out
517
+ .gsub(/&nbsp;/i, " ").gsub(/&amp;/i, "&").gsub(/&lt;/i, "<").gsub(/&gt;/i, ">")
518
+ .gsub(/&quot;/i, '"').gsub(/&apos;/i, "'")
519
+ .gsub(/&mdash;/i, "—").gsub(/&ndash;/i, "–").gsub(/&hellip;/i, "…")
520
+ .gsub(/&#(\d+);/) { [$1.to_i].pack("U") rescue " " }
521
+ .gsub(/&#x([\da-f]+);/i) { [$1.to_i(16)].pack("U") rescue " " }
522
+ .gsub(/&\w+;/, " ")
523
+ .gsub(/[ \t]+/, " ")
524
+ .gsub(/\n{3,}/, "\n\n")
525
+ .gsub(/<(\w+)>\s*<\/\1>/, "") # drop empty tags
526
+ .strip
527
+
528
+ out.length > 10_000 ? out[0, 10_000] : out
503
529
  end
504
530
 
505
531
  def compile_regexes(query)
@@ -593,7 +619,7 @@ module MarkdownServer
593
619
  html = fetch_external_page(url)
594
620
  halt 502, '{"error":"fetch failed"}' unless html
595
621
 
596
- JSON.dump({ title: page_title(html), text: page_text(html) })
622
+ JSON.dump({ title: page_title(html), html: page_html(html) })
597
623
  end
598
624
 
599
625
  get "/search/?*" do
@@ -1,3 +1,3 @@
1
1
  module MarkdownServer
2
- VERSION = "0.4.6"
2
+ VERSION = "0.4.7"
3
3
  end
data/views/layout.erb CHANGED
@@ -1541,7 +1541,7 @@
1541
1541
  if (extCached && typeof extCached === 'object') {
1542
1542
  showPopup(x, y, extCached.title || label,
1543
1543
  '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1544
- '<div style="font-family:sans-serif;font-size:0.82rem;line-height:1.55;color:#444;margin-top:0.5rem">' + escHtml(extCached.text) + '</div>');
1544
+ extCached.html);
1545
1545
  } else if (extCached === false) {
1546
1546
  showPopup(x, y, label,
1547
1547
  '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
@@ -1560,10 +1560,10 @@
1560
1560
  '<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1561
1561
  return;
1562
1562
  }
1563
- cache[extKey] = { title: data.title, text: data.text };
1563
+ cache[extKey] = { title: data.title, html: data.html };
1564
1564
  updatePopup(
1565
1565
  '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1566
- '<div style="font-family:sans-serif;font-size:0.82rem;line-height:1.55;color:#444;margin-top:0.5rem">' + escHtml(data.text) + '</div>',
1566
+ data.html,
1567
1567
  data.title || label);
1568
1568
  })
1569
1569
  .catch(function() {
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markdownr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.6
4
+ version: 0.4.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Dunn