markdownr 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 53062f10b38520d041fb4fab0e1155e1b19e15242409996dd82122716bc727d8
4
- data.tar.gz: 17ee73d28a5ddc64a915d190f31c157c6d8c9319451c1fe754c3c41417bab9a9
3
+ metadata.gz: ae4a95fd815bc7acdaf267cf6ac5b056debfa7da5812b66c254e0ef002c01da6
4
+ data.tar.gz: cb4e91f2e5aab746588702770f331e20fea2d71e586c84835417cf5d6d27ffe7
5
5
  SHA512:
6
- metadata.gz: 8c762e19d73d839cdcf1ff5f4e16c009f22b575a5672936eabef2d5131641d639ae26771bce19f25adbb287e6f7c5d71bb0561b76f302ea32db4b3ad41dc2937
7
- data.tar.gz: e28d99ad58d6ee904aa3814f665fa31d0313605629be193a482b6f0ef30cce74868800a18cec6db0ecb0101f3182dc48d6b8b561a75d9e39f04db8b7daa9e443
6
+ metadata.gz: 20729eb32420c42496a531229d72baf7692cd0de81eed5578f48e1a20fdf844fe4235367174c24b27e6cabbccf1223eb17dfe73c40a35e2058475334dd4fff50
7
+ data.tar.gz: b2902ec6c5bc7ae048da0c4f5f6d54cadd09003f8fdf524b59e2b96f519d4ca8b9a6e0cd30635266dfc64fa678eca09103aad5366bd96a840c995f6eaa000111
@@ -8,6 +8,7 @@ require "uri"
8
8
  require "cgi"
9
9
  require "pathname"
10
10
  require "set"
11
+ require "net/http"
11
12
 
12
13
  module MarkdownServer
13
14
  class App < Sinatra::Base
@@ -431,6 +432,102 @@ module MarkdownServer
431
432
  html
432
433
  end
433
434
 
435
# Upper bound, in bytes, on how much of a fetched response body is kept.
FETCH_MAX_BYTES = 512_000
# Seconds allowed for opening the connection and for each read.
FETCH_TIMEOUT = 5

# Tags kept as-is (attributes stripped)
ALLOWED_HTML = %w[p h1 h2 h3 h4 h5 h6 blockquote ul ol li
                  pre br hr strong b em i sup sub code
                  table tr td th].to_set.freeze
# Block containers — replaced with a newline (content kept)
BLOCK_HTML = %w[div section aside figure figcaption
                thead tbody tfoot].to_set.freeze
# Elements removed completely, including their content
STRIP_FULL = %w[script style nav header footer form input
                button select textarea svg iframe noscript].to_set.freeze
448
+
449
# Fetches an external page and returns its text, following redirects.
#
# @param url_str [String] absolute http:// or https:// URL
# @return [String, nil] whatever fetch_follow_redirects returns, or nil when
#   the URL cannot be parsed, is not HTTP(S), or names no host
def fetch_external_page(url_str)
  uri = URI.parse(url_str.to_s)
  # URI::HTTPS is a subclass of URI::HTTP, so one check covers both schemes.
  return nil unless uri.is_a?(URI::HTTP)
  # e.g. "http:///path" — there is no host to connect to.
  return nil if uri.host.to_s.empty?

  fetch_follow_redirects(uri, 5)
rescue URI::Error
  # Only URI parsing can raise here; an unparsable URL is simply "no page".
  nil
end
456
+
457
# Performs a GET on +uri+ and returns the response text, following up to
# +limit+ hops (the initial request counts as one).
#
# The body is streamed and reading stops once FETCH_MAX_BYTES have been
# received, so a very large response is never buffered in full. Responses
# whose Content-Type does not mention "html" or "text" are not read at all.
#
# NOTE(review): nothing here restricts the target host, so a supplied URL or
# a redirect may point at loopback/private addresses. Consider a host policy
# if this server can be reached by untrusted users.
#
# @param uri [URI::HTTP] absolute HTTP(S) URI to request
# @param limit [Integer] remaining number of requests allowed
# @return [String, nil] UTF-8 text (invalid bytes replaced by "?"), or nil on
#   any error, non-text content, bad redirect, or exhausted limit
def fetch_follow_redirects(uri, limit)
  return nil if limit <= 0

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = FETCH_TIMEOUT
  http.read_timeout = FETCH_TIMEOUT

  req = Net::HTTP::Get.new(uri.request_uri)
  req["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  req["Accept"] = "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"
  req["Accept-Language"] = "en-US,en;q=0.5"

  body = nil
  location = nil
  # throw (rather than break) leaves http.request entirely, so Net::HTTP does
  # not go on to drain the rest of the body; the connection it opened for this
  # request is closed while unwinding.
  catch(:stop_reading) do
    http.request(req) do |resp|
      case resp
      when Net::HTTPSuccess
        throw :stop_reading unless resp["content-type"].to_s.match?(/html|text/i)
        body = String.new
        resp.read_body do |chunk|
          body << chunk.b
          throw :stop_reading if body.bytesize >= FETCH_MAX_BYTES
        end
      when Net::HTTPRedirection
        location = resp["Location"].to_s
      end
    end
  end

  if body
    # The last chunk may overshoot the cap, and the cut may split a character.
    return body.byteslice(0, FETCH_MAX_BYTES).force_encoding("utf-8").scrub("?")
  end
  # Neither a success nor a usable redirect.
  return nil if location.nil? || location.empty?

  target = URI.parse(location)
  target = uri + target unless target.absolute?
  return nil unless target.is_a?(URI::HTTP)

  fetch_follow_redirects(target, limit - 1)
rescue StandardError
  # Best effort by design: timeouts, DNS/TLS failures and bad redirect
  # targets all mean "no page".
  nil
end
486
+
487
# Named character references decoded in page titles; any other named
# reference is dropped.
TITLE_ENTITIES = {
  "amp" => "&", "lt" => "<", "gt" => ">",
  "quot" => '"', "apos" => "'", "nbsp" => " "
}.freeze

# Extracts the text of the first <title> element as plain text.
#
# Nested tags are removed and character references are decoded in a single
# pass, so an escaped reference such as "&amp;lt;" yields the literal text
# "&lt;" rather than being decoded twice. Numeric references are decoded.
#
# NOTE(review): the result is plain text and may contain "<" or "&"; callers
# that place it into HTML must escape it.
#
# @param html [String] raw HTML document
# @return [String] the stripped title, or "" when there is no <title>
def page_title(html)
  raw = html.to_s[/<title[^>]*>(.*?)<\/title>/im, 1]
  return "" unless raw

  text = raw.gsub(/<[^>]+>/, "")
  decoded = text.gsub(/&(#x\h+|#\d+|\w+);/i) do
    ref = Regexp.last_match(1).downcase
    if ref.start_with?("#")
      code = ref.start_with?("#x") ? ref[2..].to_i(16) : ref[1..].to_i
      # Only real Unicode scalar values can be packed into valid UTF-8.
      scalar = code.positive? && code <= 0x10FFFF && !(0xD800..0xDFFF).cover?(code)
      scalar ? [code].pack("U") : ""
    else
      TITLE_ENTITIES.fetch(ref, "")
    end
  end
  decoded.strip
end
493
+
494
# Named character references turned into literal text by page_html.
PAGE_TEXT_ENTITIES = {
  "nbsp" => " ", "quot" => '"', "apos" => "'",
  "mdash" => "\u2014", "ndash" => "\u2013", "hellip" => "\u2026"
}.freeze
# Characters that must stay escaped because the result is consumed as HTML.
PAGE_MARKUP_ESCAPES = { "<" => "&lt;", ">" => "&gt;", "&" => "&amp;" }.freeze

# Reduces a fetched HTML document to a small, attribute-free HTML fragment.
#
# Steps: drop STRIP_FULL elements and comments with their content, pick the
# first <article>, <main> or <body> (else the whole input), rewrite tags
# (ALLOWED_HTML kept without attributes, BLOCK_HTML become newlines, all
# others removed), decode character references, tidy whitespace, drop empty
# elements, and cap the result at 10,000 characters.
#
# The output is HTML, so references that stand for "<", ">" or "&" are kept
# escaped and any "<" that is not part of a recognised tag is escaped too.
# Otherwise escaped markup in the source page would turn into live markup.
#
# @param raw [String] raw HTML document
# @return [String] sanitized fragment, at most 10,000 characters
def page_html(raw)
  w = raw.to_s.dup
  # \b keeps e.g. "nav" from matching <navigation>; end tags may carry spaces.
  STRIP_FULL.each { |t| w.gsub!(/<#{t}\b[^>]*>.*?<\/#{t}\b[^>]*>/im, " ") }
  w.gsub!(/<!--.*?-->/m, " ")

  # Prefer a focused content block
  content = w[/<article[^>]*>(.*?)<\/article>/im, 1] ||
            w[/<main[^>]*>(.*?)<\/main>/im, 1] ||
            w[/<body[^>]*>(.*?)<\/body>/im, 1] ||
            w

  # Alternatives: a real tag, a declaration/processing instruction, or a
  # stray "<" that belongs to no tag.
  out = content.gsub(/<(\/?)(\w+)[^>]*>|<[!?][^>]*>|</) do |matched|
    md = Regexp.last_match
    tag = md[2]&.downcase
    if tag.nil?
      matched == "<" ? "&lt;" : ""
    elsif ALLOWED_HTML.include?(tag)
      "<#{md[1]}#{tag}>"
    elsif BLOCK_HTML.include?(tag)
      "\n"
    else
      ""
    end
  end

  out = out.gsub(/&(#x\h+|#\d+|\w+);/i) { decode_page_entity(Regexp.last_match(1)) }
           .gsub(/[ \t]+/, " ")
           .gsub(/\n{3,}/, "\n\n")
           .gsub(/<(\w+)>\s*<\/\1>/, "") # drop empty tags
           .strip

  return out if out.length <= 10_000

  # Every "<" left in the text starts a tag, so a trailing "<..." without ">"
  # is a tag cut in half by the truncation.
  out[0, 10_000].sub(/<[^>]*\z/, "")
end

# Converts one character reference (the text between "&" and ";") to HTML-safe text.
#
# @param ref [String] e.g. "nbsp", "#8230" or "#x41"
# @return [String] the replacement text; " " for unknown or invalid references
def decode_page_entity(ref)
  name = ref.downcase
  return "&#{name};" if %w[amp lt gt].include?(name)
  return PAGE_TEXT_ENTITIES.fetch(name, " ") unless name.start_with?("#")

  code = name.start_with?("#x") ? name[2..].to_i(16) : name[1..].to_i
  scalar = code.positive? && code <= 0x10FFFF && !(0xD800..0xDFFF).cover?(code)
  return " " unless scalar

  char = [code].pack("U")
  PAGE_MARKUP_ESCAPES.fetch(char, char)
end
530
+
434
531
  def compile_regexes(query)
435
532
  words = query.split(/\s+/).reject(&:empty?)
436
533
  return nil if words.empty?
@@ -514,6 +611,17 @@ module MarkdownServer
514
611
  JSON.dump({ title: title.to_s, html: html })
515
612
  end
516
613
 
614
# GET /fetch?url=... — responds with JSON holding the title and the
# sanitized HTML of an external page. 400 when the URL is not http(s),
# 502 when the page could not be fetched.
get "/fetch" do
  content_type :json

  target = params[:url].to_s.strip
  unless target.match?(/\Ahttps?:\/\//i)
    halt 400, '{"error":"invalid url"}'
  end

  page = fetch_external_page(target)
  unless page
    halt 502, '{"error":"fetch failed"}'
  end

  payload = { title: page_title(page), html: page_html(page) }
  JSON.dump(payload)
end
624
+
517
625
  get "/search/?*" do
518
626
  requested = params["splat"].first.to_s.chomp("/")
519
627
  @query = params[:q].to_s.strip
@@ -1,3 +1,3 @@
1
1
  module MarkdownServer
2
- VERSION = "0.4.5"
2
+ VERSION = "0.4.7"
3
3
  end
data/views/layout.erb CHANGED
@@ -1535,14 +1535,47 @@
1535
1535
  });
1536
1536
  }
1537
1537
  }
1538
- } else {
1539
- var urlBody = '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>';
1540
- if (isExternal(href)) {
1541
- var host = '';
1542
- try { host = new URL(href).hostname; } catch(e) {}
1543
- if (host) urlBody += '<p style="margin:0;color:#888;font-family:sans-serif;font-size:0.82rem">External: <strong style="color:#555">' + escHtml(host) + '</strong></p>';
1538
+ } else if (isExternal(href)) {
1539
+ var extKey = 'ext:' + href;
1540
+ var extCached = cache[extKey];
1541
+ if (extCached && typeof extCached === 'object') {
1542
+ showPopup(x, y, extCached.title || label,
1543
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1544
+ extCached.html);
1545
+ } else if (extCached === false) {
1546
+ showPopup(x, y, label,
1547
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1548
+ '<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1549
+ } else {
1550
+ showPopup(x, y, label, '<p style="opacity:0.5;margin:0;font-family:sans-serif">Loading\u2026</p>');
1551
+ if (extCached === undefined) {
1552
+ cache[extKey] = null;
1553
+ fetch('/fetch?url=' + encodeURIComponent(href))
1554
+ .then(function(r) { return r.ok ? r.json() : null; })
1555
+ .then(function(data) {
1556
+ if (!data || data.error) {
1557
+ cache[extKey] = false;
1558
+ updatePopup(
1559
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1560
+ '<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1561
+ return;
1562
+ }
1563
+ cache[extKey] = { title: data.title, html: data.html };
1564
+ updatePopup(
1565
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1566
+ data.html,
1567
+ data.title || label);
1568
+ })
1569
+ .catch(function() {
1570
+ cache[extKey] = false;
1571
+ updatePopup(
1572
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1573
+ '<p style="margin:0.5rem 0 0;color:#c44;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1574
+ });
1575
+ }
1544
1576
  }
1545
- showPopup(x, y, label, urlBody);
1577
+ } else {
1578
+ showPopup(x, y, label, '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>');
1546
1579
  }
1547
1580
  }
1548
1581
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markdownr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.5
4
+ version: 0.4.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Dunn