markdownr 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/markdown_server/app.rb +108 -0
- data/lib/markdown_server/version.rb +1 -1
- data/views/layout.erb +40 -7
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ae4a95fd815bc7acdaf267cf6ac5b056debfa7da5812b66c254e0ef002c01da6
|
|
4
|
+
data.tar.gz: cb4e91f2e5aab746588702770f331e20fea2d71e586c84835417cf5d6d27ffe7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 20729eb32420c42496a531229d72baf7692cd0de81eed5578f48e1a20fdf844fe4235367174c24b27e6cabbccf1223eb17dfe73c40a35e2058475334dd4fff50
|
|
7
|
+
data.tar.gz: b2902ec6c5bc7ae048da0c4f5f6d54cadd09003f8fdf524b59e2b96f519d4ca8b9a6e0cd30635266dfc64fa678eca09103aad5366bd96a840c995f6eaa000111
|
data/lib/markdown_server/app.rb
CHANGED
|
@@ -8,6 +8,7 @@ require "uri"
|
|
|
8
8
|
require "cgi"
|
|
9
9
|
require "pathname"
|
|
10
10
|
require "set"
|
|
11
|
+
require "net/http"
|
|
11
12
|
|
|
12
13
|
module MarkdownServer
|
|
13
14
|
class App < Sinatra::Base
|
|
@@ -431,6 +432,102 @@ module MarkdownServer
|
|
|
431
432
|
html
|
|
432
433
|
end
|
|
433
434
|
|
|
435
|
+
# Maximum number of response-body bytes kept from a fetched page.
FETCH_MAX_BYTES = 512_000
# Value assigned to both the open and the read timeout of outbound requests
# (Net::HTTP interprets these as seconds).
FETCH_TIMEOUT = 5

# The three sets below are frozen: they are lookup tables only and must not be
# mutated at runtime.

# Tags kept as-is (attributes stripped)
ALLOWED_HTML = %w[p h1 h2 h3 h4 h5 h6 blockquote ul ol li
                  pre br hr strong b em i sup sub code
                  table tr td th].to_set.freeze
# Block containers — replaced with a newline (content kept)
BLOCK_HTML = %w[div section aside figure figcaption
                thead tbody tfoot].to_set.freeze
# Elements removed completely, including their content
STRIP_FULL = %w[script style nav header footer form input
                button select textarea svg iframe noscript].to_set.freeze
|
|
448
|
+
|
|
449
|
+
# Fetches an external web page and returns its (size-capped) body text.
#
# @param url_str [String] absolute http:// or https:// URL
# @return [String, nil] the page body, or nil when the URL is not a usable
#   HTTP(S) URL or anything goes wrong while fetching (best-effort: every
#   StandardError is turned into nil)
#
# NOTE(review): the URL comes from the client and is requested by the server
# process, so loopback/private addresses are reachable through this method.
# Confirm that is acceptable for the deployments this server targets.
def fetch_external_page(url_str)
  uri = URI.parse(url_str)
  # URI::HTTPS is a subclass of URI::HTTP, so one check covers both schemes.
  return nil unless uri.is_a?(URI::HTTP)
  # "http:///path" parses without a host; never hand that to Net::HTTP.
  return nil if uri.host.to_s.empty?

  fetch_follow_redirects(uri, 5)
rescue StandardError
  nil
end
|
|
456
|
+
|
|
457
|
+
# Performs a GET for +uri+, following up to +limit+ hops, and returns the
# response body as valid UTF-8 text.
#
# The body is streamed and the transfer is aborted as soon as
# FETCH_MAX_BYTES bytes have arrived (or immediately when the content type is
# not HTML/text), so an oversized or binary response is never buffered in
# full.
#
# @param uri [URI::HTTP] absolute HTTP(S) URI to request
# @param limit [Integer] remaining number of requests allowed (redirect budget)
# @return [String, nil] at most FETCH_MAX_BYTES bytes of body, with invalid
#   byte sequences replaced by "?"; nil on a non-2xx/3xx status, a
#   non-text content type, an unusable redirect target, an exhausted budget,
#   or any error
#
# NOTE(review): FETCH_TIMEOUT bounds connecting and each individual read, not
# the whole transfer. Redirect targets are only checked for scheme, so
# loopback/private hosts are reachable — confirm that is intended.
def fetch_follow_redirects(uri, limit)
  return nil if limit <= 0
  return nil if uri.host.to_s.empty?

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = FETCH_TIMEOUT
  http.read_timeout = FETCH_TIMEOUT

  req = Net::HTTP::Get.new(uri.request_uri)
  req["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  req["Accept"] = "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"
  req["Accept-Language"] = "en-US,en;q=0.5"

  # Private sentinel. An exception (rather than break/return) is used to leave
  # the request block so that Net::HTTP tears the connection down instead of
  # draining the remainder of the body.
  stop_transfer = Class.new(StandardError)
  raw = String.new # binary buffer
  wanted = false
  location = nil

  begin
    http.start do |conn|
      conn.request(req) do |resp|
        case resp
        when Net::HTTPSuccess
          wanted = resp["content-type"].to_s.match?(/html|text/i)
          raise stop_transfer unless wanted

          resp.read_body do |chunk|
            raw << chunk.b
            raise stop_transfer if raw.bytesize >= FETCH_MAX_BYTES
          end
        when Net::HTTPRedirection
          location = resp["Location"].to_s
        end
      end
    end
  rescue stop_transfer
    # Expected: we stopped reading on purpose.
  end

  if wanted
    raw.byteslice(0, FETCH_MAX_BYTES)
       .force_encoding("utf-8")
       .encode("utf-8", invalid: :replace, undef: :replace, replace: "?")
  elsif location
    # An empty Location would resolve to the current URI and loop.
    return nil if location.empty?

    target = URI.parse(location)
    target = uri + target unless target.absolute?
    # URI::HTTPS is a subclass of URI::HTTP.
    return nil unless target.is_a?(URI::HTTP)

    fetch_follow_redirects(target, limit - 1)
  end
rescue StandardError
  # Best-effort fetch: timeouts, TLS/socket failures and malformed redirect
  # targets are all reported as "no content".
  nil
end
|
|
486
|
+
|
|
487
|
+
# Extracts the text of the first <title> element of an HTML document.
#
# Nested tags are removed and character references are decoded in a single
# pass (so "&amp;lt;" becomes "&lt;", not "<"). The named references amp, lt,
# gt, quot, apos and nbsp and all numeric references are decoded; any other
# named reference is dropped.
#
# @param html [String] raw HTML
# @return [String] the decoded, stripped title, or "" when there is no <title>
#
# NOTE(review): the result is plain text and may contain "<" or "&"; it must
# be escaped by whoever embeds it in markup.
def page_title(html)
  inner = html[/<title\b[^>]*>(.*?)<\/title>/im, 1]
  return "" unless inner

  named = { "amp" => "&", "lt" => "<", "gt" => ">",
            "quot" => '"', "apos" => "'", "nbsp" => " " }
  # Turns a code point into a character; out-of-range or non-encodable values
  # (e.g. surrogates) decode to nothing.
  from_codepoint = lambda do |code|
    char = [code].pack("U")
    char.valid_encoding? ? char : ""
  rescue RangeError
    ""
  end

  text = inner.gsub(/<[^>]+>/, "")
  text = text.gsub(/&(#x\h+|#\d+|\w+);/i) do
    ref = Regexp.last_match(1)
    if ref.start_with?("#x", "#X") then from_codepoint.call(ref[2..-1].to_i(16))
    elsif ref.start_with?("#") then from_codepoint.call(ref[1..-1].to_i)
    else named.fetch(ref.downcase, "")
    end
  end
  text.strip
end
|
|
493
|
+
|
|
494
|
+
# Reduces a raw HTML document to a small, attribute-free HTML fragment.
#
# Steps: drop STRIP_FULL elements (with their content) and comments; narrow to
# the first <article>, <main> or <body> if present; rewrite every tag
# (ALLOWED_HTML kept without attributes, BLOCK_HTML turned into a newline,
# everything else removed); normalise character references and whitespace;
# cap the result at 10,000 characters.
#
# The result is concatenated into popup markup by the client, so it has to
# stay safe as HTML:
# * &amp;, &lt; and &gt; are left encoded, and numeric references that decode
#   to "&", "<" or ">" are re-encoded, so escaped text cannot turn back into
#   tags after filtering;
# * after all removals, any "<" that does not start a bare allowed tag is
#   escaped, because deleting a tag can splice its neighbours into a new one
#   (e.g. "<<x>script>");
# * a tag cut in half by the length cap is dropped.
#
# @param raw [String] raw HTML
# @return [String] sanitized fragment, at most 10,000 characters
def page_html(raw)
  w = raw.dup
  # Remove inert elements and their entire contents
  STRIP_FULL.each { |t| w.gsub!(/<#{t}\b[^>]*>.*?<\/#{t}\s*>/im, " ") }
  w.gsub!(/<!--.*?-->/m, " ")

  # Prefer a focused content block
  content = w[/<article\b[^>]*>(.*?)<\/article>/im, 1] ||
            w[/<main\b[^>]*>(.*?)<\/main>/im, 1] ||
            w[/<body\b[^>]*>(.*?)<\/body>/im, 1] ||
            w

  # Rewrite tags: keep allowed (strip attrs), block→newline, rest→empty
  out = content.gsub(/<(\/?)(\w+)[^>]*>/) do
    slash = Regexp.last_match(1)
    tag = Regexp.last_match(2).downcase
    if ALLOWED_HTML.include?(tag) then "<#{slash}#{tag}>"
    elsif BLOCK_HTML.include?(tag) then "\n"
    else ""
    end
  end

  # Normalise character references in one pass (no double decoding).
  named = { "nbsp" => " ", "quot" => '"', "apos" => "'",
            "mdash" => "\u2014", "ndash" => "\u2013", "hellip" => "\u2026" }
  markup = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;" }
  # Code point → character; markup-significant characters stay encoded and
  # values that are out of range or not valid UTF-8 become a space.
  from_codepoint = lambda do |code|
    char = [code].pack("U")
    char.valid_encoding? ? markup.fetch(char, char) : " "
  rescue RangeError
    " "
  end

  out = out.gsub(/&(#x\h+|#\d+|\w+);/i) do
    name = Regexp.last_match(1).downcase
    if name.start_with?("#x") then from_codepoint.call(name[2..-1].to_i(16))
    elsif name.start_with?("#") then from_codepoint.call(name[1..-1].to_i)
    elsif %w[amp lt gt].include?(name) then "&#{name};"
    else named.fetch(name, " ") # unknown named references become a space
    end
  end

  out = out.gsub(/[ \t]+/, " ")
           .gsub(/\n{3,}/, "\n\n")
           .gsub(/<(\w+)>\s*<\/\1>/, "") # drop empty tags

  # Must run after every step that deletes text (see method comment).
  tag_names = ALLOWED_HTML.map { |t| Regexp.escape(t) }.join("|")
  out = out.gsub(/<(?!\/?(?:#{tag_names})>)/, "&lt;").strip

  return out if out.length <= 10_000

  # Every remaining "<" starts a complete tag, so an unterminated one at the
  # end can only be the tag the cut went through.
  out[0, 10_000].sub(/<[^>]*\z/, "")
end
|
|
530
|
+
|
|
434
531
|
def compile_regexes(query)
|
|
435
532
|
words = query.split(/\s+/).reject(&:empty?)
|
|
436
533
|
return nil if words.empty?
|
|
@@ -514,6 +611,17 @@ module MarkdownServer
|
|
|
514
611
|
JSON.dump({ title: title.to_s, html: html })
|
|
515
612
|
end
|
|
516
613
|
|
|
614
|
+
# GET /fetch?url=<absolute http(s) URL>
#
# Retrieves the given external page server-side and responds with a JSON
# object { title:, html: } built by page_title and page_html.
# Responds 400 when the url parameter does not start with http:// or https://,
# and 502 when the page could not be retrieved.
get "/fetch" do
  content_type :json

  target = params[:url].to_s.strip
  unless target.match?(/\Ahttps?:\/\//i)
    halt 400, '{"error":"invalid url"}'
  end

  page = fetch_external_page(target)
  unless page
    halt 502, '{"error":"fetch failed"}'
  end

  payload = { title: page_title(page), html: page_html(page) }
  JSON.dump(payload)
end
|
|
624
|
+
|
|
517
625
|
get "/search/?*" do
|
|
518
626
|
requested = params["splat"].first.to_s.chomp("/")
|
|
519
627
|
@query = params[:q].to_s.strip
|
data/views/layout.erb
CHANGED
|
@@ -1535,14 +1535,47 @@
|
|
|
1535
1535
|
});
|
|
1536
1536
|
}
|
|
1537
1537
|
}
|
|
1538
|
-
} else {
|
|
1539
|
-
var
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1538
|
+
} else if (isExternal(href)) {
|
|
1539
|
+
var extKey = 'ext:' + href;
|
|
1540
|
+
var extCached = cache[extKey];
|
|
1541
|
+
if (extCached && typeof extCached === 'object') {
|
|
1542
|
+
showPopup(x, y, extCached.title || label,
|
|
1543
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1544
|
+
extCached.html);
|
|
1545
|
+
} else if (extCached === false) {
|
|
1546
|
+
showPopup(x, y, label,
|
|
1547
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1548
|
+
'<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1549
|
+
} else {
|
|
1550
|
+
showPopup(x, y, label, '<p style="opacity:0.5;margin:0;font-family:sans-serif">Loading\u2026</p>');
|
|
1551
|
+
if (extCached === undefined) {
|
|
1552
|
+
cache[extKey] = null;
|
|
1553
|
+
fetch('/fetch?url=' + encodeURIComponent(href))
|
|
1554
|
+
.then(function(r) { return r.ok ? r.json() : null; })
|
|
1555
|
+
.then(function(data) {
|
|
1556
|
+
if (!data || data.error) {
|
|
1557
|
+
cache[extKey] = false;
|
|
1558
|
+
updatePopup(
|
|
1559
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1560
|
+
'<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1561
|
+
return;
|
|
1562
|
+
}
|
|
1563
|
+
cache[extKey] = { title: data.title, html: data.html };
|
|
1564
|
+
updatePopup(
|
|
1565
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1566
|
+
data.html,
|
|
1567
|
+
data.title || label);
|
|
1568
|
+
})
|
|
1569
|
+
.catch(function() {
|
|
1570
|
+
cache[extKey] = false;
|
|
1571
|
+
updatePopup(
|
|
1572
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1573
|
+
'<p style="margin:0.5rem 0 0;color:#c44;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1574
|
+
});
|
|
1575
|
+
}
|
|
1544
1576
|
}
|
|
1545
|
-
|
|
1577
|
+
} else {
|
|
1578
|
+
showPopup(x, y, label, '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>');
|
|
1546
1579
|
}
|
|
1547
1580
|
}
|
|
1548
1581
|
|