markdownr 0.4.6 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/markdown_server/app.rb +43 -17
- data/lib/markdown_server/version.rb +1 -1
- data/views/layout.erb +3 -3
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ae4a95fd815bc7acdaf267cf6ac5b056debfa7da5812b66c254e0ef002c01da6
|
|
4
|
+
data.tar.gz: cb4e91f2e5aab746588702770f331e20fea2d71e586c84835417cf5d6d27ffe7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 20729eb32420c42496a531229d72baf7692cd0de81eed5578f48e1a20fdf844fe4235367174c24b27e6cabbccf1223eb17dfe73c40a35e2058475334dd4fff50
|
|
7
|
+
data.tar.gz: b2902ec6c5bc7ae048da0c4f5f6d54cadd09003f8fdf524b59e2b96f519d4ca8b9a6e0cd30635266dfc64fa678eca09103aad5366bd96a840c995f6eaa000111
|
data/lib/markdown_server/app.rb
CHANGED
|
@@ -435,6 +435,17 @@ module MarkdownServer
|
|
|
435
435
|
FETCH_MAX_BYTES = 512_000
|
|
436
436
|
FETCH_TIMEOUT = 5
|
|
437
437
|
|
|
438
|
+
# Tags kept as-is (attributes stripped)
|
|
439
|
+
ALLOWED_HTML = %w[p h1 h2 h3 h4 h5 h6 blockquote ul ol li
|
|
440
|
+
pre br hr strong b em i sup sub code
|
|
441
|
+
table tr td th].to_set
|
|
442
|
+
# Block containers — replaced with a newline (content kept)
|
|
443
|
+
BLOCK_HTML = %w[div section aside figure figcaption
|
|
444
|
+
thead tbody tfoot].to_set
|
|
445
|
+
# Elements removed completely, including their content
|
|
446
|
+
STRIP_FULL = %w[script style nav header footer form input
|
|
447
|
+
button select textarea svg iframe noscript].to_set
|
|
448
|
+
|
|
438
449
|
def fetch_external_page(url_str)
|
|
439
450
|
uri = URI.parse(url_str)
|
|
440
451
|
return nil unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
|
@@ -480,26 +491,41 @@ module MarkdownServer
|
|
|
480
491
|
} || ""
|
|
481
492
|
end
|
|
482
493
|
|
|
483
|
-
def
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
.gsub(/<footer[^>]*>.*?<\/footer>/im," ")
|
|
491
|
-
.gsub(/<!--.*?-->/m, " ")
|
|
492
|
-
# Prefer <article> or <main>, fall back to <body>, then whole doc
|
|
494
|
+
def page_html(raw)
|
|
495
|
+
w = raw.dup
|
|
496
|
+
# Remove inert elements and their entire contents
|
|
497
|
+
STRIP_FULL.each { |t| w.gsub!(/<#{t}[^>]*>.*?<\/#{t}>/im, " ") }
|
|
498
|
+
w.gsub!(/<!--.*?-->/m, " ")
|
|
499
|
+
|
|
500
|
+
# Prefer a focused content block
|
|
493
501
|
content = w.match(/<article[^>]*>(.*?)<\/article>/im)&.[](1) ||
|
|
494
502
|
w.match(/<main[^>]*>(.*?)<\/main>/im)&.[](1) ||
|
|
495
503
|
w.match(/<body[^>]*>(.*?)<\/body>/im)&.[](1) ||
|
|
496
504
|
w
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
.
|
|
502
|
-
|
|
505
|
+
|
|
506
|
+
# Rewrite tags: keep allowed (strip attrs), block→newline, rest→empty
|
|
507
|
+
out = content.gsub(/<(\/?)(\w+)[^>]*>/) do
|
|
508
|
+
slash, tag = $1, $2.downcase
|
|
509
|
+
if ALLOWED_HTML.include?(tag) then "<#{slash}#{tag}>"
|
|
510
|
+
elsif BLOCK_HTML.include?(tag) then "\n"
|
|
511
|
+
else ""
|
|
512
|
+
end
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
# Decode HTML entities
|
|
516
|
+
out = out
|
|
517
|
+
.gsub(/&nbsp;/i, " ").gsub(/&amp;/i, "&").gsub(/&lt;/i, "<").gsub(/&gt;/i, ">")
|
|
518
|
+
.gsub(/&quot;/i, '"').gsub(/&apos;/i, "'")
|
|
519
|
+
.gsub(/&mdash;/i, "—").gsub(/&ndash;/i, "–").gsub(/&hellip;/i, "…")
|
|
520
|
+
.gsub(/&#(\d+);/) { [$1.to_i].pack("U") rescue " " }
|
|
521
|
+
.gsub(/&#x([\da-f]+);/i) { [$1.to_i(16)].pack("U") rescue " " }
|
|
522
|
+
.gsub(/&\w+;/, " ")
|
|
523
|
+
.gsub(/[ \t]+/, " ")
|
|
524
|
+
.gsub(/\n{3,}/, "\n\n")
|
|
525
|
+
.gsub(/<(\w+)>\s*<\/\1>/, "") # drop empty tags
|
|
526
|
+
.strip
|
|
527
|
+
|
|
528
|
+
out.length > 10_000 ? out[0, 10_000] : out
|
|
503
529
|
end
|
|
504
530
|
|
|
505
531
|
def compile_regexes(query)
|
|
@@ -593,7 +619,7 @@ module MarkdownServer
|
|
|
593
619
|
html = fetch_external_page(url)
|
|
594
620
|
halt 502, '{"error":"fetch failed"}' unless html
|
|
595
621
|
|
|
596
|
-
JSON.dump({ title: page_title(html),
|
|
622
|
+
JSON.dump({ title: page_title(html), html: page_html(html) })
|
|
597
623
|
end
|
|
598
624
|
|
|
599
625
|
get "/search/?*" do
|
data/views/layout.erb
CHANGED
|
@@ -1541,7 +1541,7 @@
|
|
|
1541
1541
|
if (extCached && typeof extCached === 'object') {
|
|
1542
1542
|
showPopup(x, y, extCached.title || label,
|
|
1543
1543
|
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1544
|
-
|
|
1544
|
+
extCached.html);
|
|
1545
1545
|
} else if (extCached === false) {
|
|
1546
1546
|
showPopup(x, y, label,
|
|
1547
1547
|
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
@@ -1560,10 +1560,10 @@
|
|
|
1560
1560
|
'<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1561
1561
|
return;
|
|
1562
1562
|
}
|
|
1563
|
-
cache[extKey] = { title: data.title,
|
|
1563
|
+
cache[extKey] = { title: data.title, html: data.html };
|
|
1564
1564
|
updatePopup(
|
|
1565
1565
|
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1566
|
-
|
|
1566
|
+
data.html,
|
|
1567
1567
|
data.title || label);
|
|
1568
1568
|
})
|
|
1569
1569
|
.catch(function() {
|