markdownr 0.4.5 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/markdown_server/app.rb +82 -0
- data/lib/markdown_server/version.rb +1 -1
- data/views/layout.erb +40 -7
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 596b75e231efdc839c8729fe1db59a8b2933c08f12059cf753b261445d888813
|
|
4
|
+
data.tar.gz: ddc7fc42eb7f55286e0edcb296ec816567146e1ad412ae4df4243ab31255e4ea
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 34a2585d6a3818cd175d40f8b9f728bee1a83b3dbf2ac238c320b8b2be9d49c895dc9fb9709edfeed149317d4a03caa1536a87075da706711ceafa61a3c789d9
|
|
7
|
+
data.tar.gz: 256c941d8ad83a87fca9d205be075390c425c3b883dbd7fc3b2ed4f71e0f61c6cc29517a56897046c9c2a9c4507978b84445c9edeaf456e054527e9be1ed0f00
|
data/lib/markdown_server/app.rb
CHANGED
|
@@ -8,6 +8,7 @@ require "uri"
|
|
|
8
8
|
require "cgi"
|
|
9
9
|
require "pathname"
|
|
10
10
|
require "set"
|
|
11
|
+
require "net/http"
|
|
11
12
|
|
|
12
13
|
module MarkdownServer
|
|
13
14
|
class App < Sinatra::Base
|
|
@@ -431,6 +432,76 @@ module MarkdownServer
|
|
|
431
432
|
html
|
|
432
433
|
end
|
|
433
434
|
|
|
435
|
+
FETCH_MAX_BYTES = 512_000
|
|
436
|
+
FETCH_TIMEOUT = 5
|
|
437
|
+
|
|
438
|
+
def fetch_external_page(url_str)
|
|
439
|
+
uri = URI.parse(url_str)
|
|
440
|
+
return nil unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
|
441
|
+
fetch_follow_redirects(uri, 5)
|
|
442
|
+
rescue
|
|
443
|
+
nil
|
|
444
|
+
end
|
|
445
|
+
|
|
446
|
+
def fetch_follow_redirects(uri, limit)
|
|
447
|
+
return nil if limit <= 0
|
|
448
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
449
|
+
http.use_ssl = (uri.scheme == "https")
|
|
450
|
+
http.open_timeout = FETCH_TIMEOUT
|
|
451
|
+
http.read_timeout = FETCH_TIMEOUT
|
|
452
|
+
req = Net::HTTP::Get.new(uri.request_uri)
|
|
453
|
+
req["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
|
454
|
+
req["Accept"] = "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"
|
|
455
|
+
req["Accept-Language"] = "en-US,en;q=0.5"
|
|
456
|
+
resp = http.request(req)
|
|
457
|
+
case resp
|
|
458
|
+
when Net::HTTPSuccess
|
|
459
|
+
ct = resp["content-type"].to_s
|
|
460
|
+
return nil unless ct.match?(/html|text/i)
|
|
461
|
+
body = resp.body.to_s
|
|
462
|
+
body = body.b[0, FETCH_MAX_BYTES].force_encoding("utf-8")
|
|
463
|
+
body.encode("utf-8", invalid: :replace, undef: :replace, replace: "?")
|
|
464
|
+
when Net::HTTPRedirection
|
|
465
|
+
loc = resp["Location"].to_s
|
|
466
|
+
new_uri = (URI.parse(loc) rescue nil)
|
|
467
|
+
return nil unless new_uri
|
|
468
|
+
new_uri = uri + new_uri unless new_uri.absolute?
|
|
469
|
+
return nil unless new_uri.is_a?(URI::HTTP) || new_uri.is_a?(URI::HTTPS)
|
|
470
|
+
fetch_follow_redirects(new_uri, limit - 1)
|
|
471
|
+
end
|
|
472
|
+
rescue
|
|
473
|
+
nil
|
|
474
|
+
end
|
|
475
|
+
|
|
476
|
+
def page_title(html)
|
|
477
|
+
html.match(/<title[^>]*>(.*?)<\/title>/im)&.then { |m|
|
|
478
|
+
m[1].gsub(/<[^>]+>/, "").gsub(/&amp;/i, "&").gsub(/&lt;/i, "<")
|
|
479
|
+
.gsub(/&gt;/i, ">").gsub(/&quot;/i, '"').gsub(/&#?\w+;/, "").strip
|
|
480
|
+
} || ""
|
|
481
|
+
end
|
|
482
|
+
|
|
483
|
+
def page_text(html)
|
|
484
|
+
# Strip inert elements
|
|
485
|
+
w = html
|
|
486
|
+
.gsub(/<script[^>]*>.*?<\/script>/im, " ")
|
|
487
|
+
.gsub(/<style[^>]*>.*?<\/style>/im, " ")
|
|
488
|
+
.gsub(/<nav[^>]*>.*?<\/nav>/im, " ")
|
|
489
|
+
.gsub(/<header[^>]*>.*?<\/header>/im," ")
|
|
490
|
+
.gsub(/<footer[^>]*>.*?<\/footer>/im," ")
|
|
491
|
+
.gsub(/<!--.*?-->/m, " ")
|
|
492
|
+
# Prefer <article> or <main>, fall back to <body>, then whole doc
|
|
493
|
+
content = w.match(/<article[^>]*>(.*?)<\/article>/im)&.[](1) ||
|
|
494
|
+
w.match(/<main[^>]*>(.*?)<\/main>/im)&.[](1) ||
|
|
495
|
+
w.match(/<body[^>]*>(.*?)<\/body>/im)&.[](1) ||
|
|
496
|
+
w
|
|
497
|
+
text = content
|
|
498
|
+
.gsub(/<[^>]+>/, " ")
|
|
499
|
+
.gsub(/&nbsp;/i, " ").gsub(/&amp;/i, "&").gsub(/&lt;/i, "<")
|
|
500
|
+
.gsub(/&gt;/i, ">").gsub(/&quot;/i, '"').gsub(/&#?\w+;/, " ")
|
|
501
|
+
.gsub(/\s+/, " ").strip
|
|
502
|
+
text.length > 3000 ? "#{text[0, 3000]}…" : text
|
|
503
|
+
end
|
|
504
|
+
|
|
434
505
|
def compile_regexes(query)
|
|
435
506
|
words = query.split(/\s+/).reject(&:empty?)
|
|
436
507
|
return nil if words.empty?
|
|
@@ -514,6 +585,17 @@ module MarkdownServer
|
|
|
514
585
|
JSON.dump({ title: title.to_s, html: html })
|
|
515
586
|
end
|
|
516
587
|
|
|
588
|
+
get "/fetch" do
|
|
589
|
+
content_type :json
|
|
590
|
+
url = params[:url].to_s.strip
|
|
591
|
+
halt 400, '{"error":"invalid url"}' unless url.match?(/\Ahttps?:\/\//i)
|
|
592
|
+
|
|
593
|
+
html = fetch_external_page(url)
|
|
594
|
+
halt 502, '{"error":"fetch failed"}' unless html
|
|
595
|
+
|
|
596
|
+
JSON.dump({ title: page_title(html), text: page_text(html) })
|
|
597
|
+
end
|
|
598
|
+
|
|
517
599
|
get "/search/?*" do
|
|
518
600
|
requested = params["splat"].first.to_s.chomp("/")
|
|
519
601
|
@query = params[:q].to_s.strip
|
data/views/layout.erb
CHANGED
|
@@ -1535,14 +1535,47 @@
|
|
|
1535
1535
|
});
|
|
1536
1536
|
}
|
|
1537
1537
|
}
|
|
1538
|
-
} else {
|
|
1539
|
-
var
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1538
|
+
} else if (isExternal(href)) {
|
|
1539
|
+
var extKey = 'ext:' + href;
|
|
1540
|
+
var extCached = cache[extKey];
|
|
1541
|
+
if (extCached && typeof extCached === 'object') {
|
|
1542
|
+
showPopup(x, y, extCached.title || label,
|
|
1543
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1544
|
+
'<div style="font-family:sans-serif;font-size:0.82rem;line-height:1.55;color:#444;margin-top:0.5rem">' + escHtml(extCached.text) + '</div>');
|
|
1545
|
+
} else if (extCached === false) {
|
|
1546
|
+
showPopup(x, y, label,
|
|
1547
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1548
|
+
'<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1549
|
+
} else {
|
|
1550
|
+
showPopup(x, y, label, '<p style="opacity:0.5;margin:0;font-family:sans-serif">Loading\u2026</p>');
|
|
1551
|
+
if (extCached === undefined) {
|
|
1552
|
+
cache[extKey] = null;
|
|
1553
|
+
fetch('/fetch?url=' + encodeURIComponent(href))
|
|
1554
|
+
.then(function(r) { return r.ok ? r.json() : null; })
|
|
1555
|
+
.then(function(data) {
|
|
1556
|
+
if (!data || data.error) {
|
|
1557
|
+
cache[extKey] = false;
|
|
1558
|
+
updatePopup(
|
|
1559
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1560
|
+
'<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1561
|
+
return;
|
|
1562
|
+
}
|
|
1563
|
+
cache[extKey] = { title: data.title, text: data.text };
|
|
1564
|
+
updatePopup(
|
|
1565
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1566
|
+
'<div style="font-family:sans-serif;font-size:0.82rem;line-height:1.55;color:#444;margin-top:0.5rem">' + escHtml(data.text) + '</div>',
|
|
1567
|
+
data.title || label);
|
|
1568
|
+
})
|
|
1569
|
+
.catch(function() {
|
|
1570
|
+
cache[extKey] = false;
|
|
1571
|
+
updatePopup(
|
|
1572
|
+
'<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
|
|
1573
|
+
'<p style="margin:0.5rem 0 0;color:#c44;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
|
|
1574
|
+
});
|
|
1575
|
+
}
|
|
1544
1576
|
}
|
|
1545
|
-
|
|
1577
|
+
} else {
|
|
1578
|
+
showPopup(x, y, label, '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>');
|
|
1546
1579
|
}
|
|
1547
1580
|
}
|
|
1548
1581
|
|