markdownr 0.4.5 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 53062f10b38520d041fb4fab0e1155e1b19e15242409996dd82122716bc727d8
4
- data.tar.gz: 17ee73d28a5ddc64a915d190f31c157c6d8c9319451c1fe754c3c41417bab9a9
3
+ metadata.gz: 596b75e231efdc839c8729fe1db59a8b2933c08f12059cf753b261445d888813
4
+ data.tar.gz: ddc7fc42eb7f55286e0edcb296ec816567146e1ad412ae4df4243ab31255e4ea
5
5
  SHA512:
6
- metadata.gz: 8c762e19d73d839cdcf1ff5f4e16c009f22b575a5672936eabef2d5131641d639ae26771bce19f25adbb287e6f7c5d71bb0561b76f302ea32db4b3ad41dc2937
7
- data.tar.gz: e28d99ad58d6ee904aa3814f665fa31d0313605629be193a482b6f0ef30cce74868800a18cec6db0ecb0101f3182dc48d6b8b561a75d9e39f04db8b7daa9e443
6
+ metadata.gz: 34a2585d6a3818cd175d40f8b9f728bee1a83b3dbf2ac238c320b8b2be9d49c895dc9fb9709edfeed149317d4a03caa1536a87075da706711ceafa61a3c789d9
7
+ data.tar.gz: 256c941d8ad83a87fca9d205be075390c425c3b883dbd7fc3b2ed4f71e0f61c6cc29517a56897046c9c2a9c4507978b84445c9edeaf456e054527e9be1ed0f00
@@ -8,6 +8,7 @@ require "uri"
8
8
  require "cgi"
9
9
  require "pathname"
10
10
  require "set"
11
+ require "net/http"
11
12
 
12
13
  module MarkdownServer
13
14
  class App < Sinatra::Base
@@ -431,6 +432,76 @@ module MarkdownServer
431
432
  html
432
433
  end
433
434
 
435
+ FETCH_MAX_BYTES = 512_000
436
+ FETCH_TIMEOUT = 5
437
+
438
# Fetches an external web page on behalf of the link-preview feature.
#
# Parses +url_str+, accepts only http/https URLs that actually name a host,
# and hands the URI to fetch_follow_redirects with a budget of 5 hops.
#
# NOTE(review): no loopback/private-address filtering happens here, so a
# caller that controls +url_str+ can make this process request internal
# hosts (SSRF). Confirm whether the server is ever reachable by untrusted
# clients before relying on this as-is.
#
# @param url_str [String] the URL to fetch
# @return [String, nil] the result of fetch_follow_redirects, or nil when
#   the URL is unusable or anything raises along the way
def fetch_external_page(url_str)
  uri = URI.parse(url_str)
  # URI::HTTPS is a subclass of URI::HTTP, so one check covers both schemes.
  return nil unless uri.is_a?(URI::HTTP)
  # "http:///path" passes the scheme check but has no host to connect to.
  return nil if uri.host.to_s.empty?

  fetch_follow_redirects(uri, 5)
rescue StandardError
  # Best-effort by design: a malformed URL or any failure means "no preview".
  nil
end
445
+
446
# Performs a GET for +uri+ and follows redirects, spending one unit of
# +limit+ per request.
#
# The response body is streamed and reading stops as soon as
# FETCH_MAX_BYTES have been collected, so the cap bounds how much is
# downloaded and held in memory rather than only trimming the result.
#
# @param uri [URI::HTTP] absolute http/https URI to request
# @param limit [Integer] remaining number of requests allowed
# @return [String, nil] the body as valid UTF-8 (invalid bytes replaced by
#   "?"), or nil when the hop budget is exhausted, the response is not a
#   2xx text/html-like document, a redirect is unusable, or anything raises
def fetch_follow_redirects(uri, limit)
  return nil if limit <= 0

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = FETCH_TIMEOUT
  http.read_timeout = FETCH_TIMEOUT

  req = Net::HTTP::Get.new(uri.request_uri)
  req["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  req["Accept"] = "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"
  req["Accept-Language"] = "en-US,en;q=0.5"

  location = nil
  http.request(req) do |resp|
    case resp
    when Net::HTTPSuccess
      return nil unless resp["content-type"].to_s.match?(/html|text/i)

      buf = String.new(encoding: Encoding::BINARY)
      resp.read_body do |chunk|
        buf << chunk.b
        break if buf.bytesize >= FETCH_MAX_BYTES
      end
      # Returning from inside the request block unwinds through Net::HTTP,
      # which closes the connection instead of draining the rest of the body.
      # The byte cut may split a multibyte character; scrub replaces the
      # dangling bytes along with any other invalid sequences.
      return buf.byteslice(0, FETCH_MAX_BYTES).force_encoding("utf-8").scrub("?")
    when Net::HTTPRedirection
      location = resp["Location"].to_s
      # Leave the block so this connection is closed before the next hop.
      break
    else
      return nil
    end
  end

  # A 3xx without a Location (e.g. 304) would otherwise resolve to the same
  # URI and be re-requested until the hop budget ran out.
  return nil if location.nil? || location.empty?

  # URI#+ resolves relative targets against the current URI; a malformed
  # Location raises and is turned into nil by the rescue below.
  target = uri + location
  # URI::HTTPS is a subclass of URI::HTTP, so this admits both schemes only.
  return nil unless target.is_a?(URI::HTTP)

  fetch_follow_redirects(target, limit - 1)
rescue StandardError
  # Best-effort: timeouts, TLS, DNS and parse failures all mean "no page".
  nil
end
475
+
476
# Extracts the text of the first <title> element from an HTML document.
#
# Nested tags are removed and character references are decoded in a single
# pass, so already-escaped text such as "&amp;lt;" becomes "&lt;" instead of
# being unescaped twice. Numeric references ("&#39;", "&#x2014;") are
# decoded to their characters; unknown or invalid references are dropped.
#
# @param html [String] HTML source
# @return [String] the stripped title, or "" when there is no <title>
def page_title(html)
  raw = html[/<title[^>]*>(.*?)<\/title>/im, 1]
  return "" unless raw

  named = { "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"', "apos" => "'", "nbsp" => " " }
  decoded = raw.gsub(/<[^>]+>/, "").gsub(/&(#?\w+);/) do
    ref = Regexp.last_match(1)
    if ref.start_with?("#")
      digits = ref[1..]
      code =
        if digits.match?(/\Ax\h+\z/i) then digits[1..].hex
        elsif digits.match?(/\A\d+\z/) then digits.to_i
        end
      begin
        if code.nil? then ""
        elsif code == 0xA0 then " " # non-breaking space, same as &nbsp;
        else code.chr(Encoding::UTF_8)
        end
      rescue RangeError
        # Surrogates and out-of-range code points have no UTF-8 character.
        ""
      end
    else
      named.fetch(ref.downcase, "")
    end
  end
  decoded.strip
end
482
+
483
# Reduces an HTML document to a short plain-text excerpt.
#
# Script, style, nav, header and footer elements and comments are removed,
# then the first <article>, <main> or <body> (in that order of preference,
# falling back to the whole document) is flattened to text. Character
# references are decoded in a single pass so escaped text is not unescaped
# twice, numeric references become their characters, and unknown or invalid
# references become a space.
#
# @param html [String] HTML source
# @return [String] whitespace-collapsed text, cut to 3000 characters with a
#   trailing ellipsis when longer
def page_text(html)
  # Elements whose contents are not readable prose, plus comments.
  inert = [
    /<script[^>]*>.*?<\/script>/im,
    /<style[^>]*>.*?<\/style>/im,
    /<nav[^>]*>.*?<\/nav>/im,
    /<header[^>]*>.*?<\/header>/im,
    /<footer[^>]*>.*?<\/footer>/im,
    /<!--.*?-->/m
  ]
  w = inert.reduce(html) { |doc, pattern| doc.gsub(pattern, " ") }

  # Prefer <article> or <main>, fall back to <body>, then the whole document.
  content = w[/<article[^>]*>(.*?)<\/article>/im, 1] ||
            w[/<main[^>]*>(.*?)<\/main>/im, 1] ||
            w[/<body[^>]*>(.*?)<\/body>/im, 1] ||
            w

  named = { "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"', "apos" => "'", "nbsp" => " " }
  decoded = content.gsub(/<[^>]+>/, " ").gsub(/&(#?\w+);/) do
    ref = Regexp.last_match(1)
    if ref.start_with?("#")
      digits = ref[1..]
      code =
        if digits.match?(/\Ax\h+\z/i) then digits[1..].hex
        elsif digits.match?(/\A\d+\z/) then digits.to_i
        end
      begin
        # U+00A0 is folded to a plain space so it collapses like &nbsp;.
        (code.nil? || code == 0xA0) ? " " : code.chr(Encoding::UTF_8)
      rescue RangeError
        # Surrogates and out-of-range code points have no UTF-8 character.
        " "
      end
    else
      named.fetch(ref.downcase, " ")
    end
  end

  text = decoded.gsub(/\s+/, " ").strip
  text.length > 3000 ? "#{text[0, 3000]}…" : text
end
504
+
434
505
  def compile_regexes(query)
435
506
  words = query.split(/\s+/).reject(&:empty?)
436
507
  return nil if words.empty?
@@ -514,6 +585,17 @@ module MarkdownServer
514
585
  JSON.dump({ title: title.to_s, html: html })
515
586
  end
516
587
 
588
# GET /fetch?url=<http(s) URL>
# Responds with JSON holding the title and text excerpt of the page at the
# given URL. Answers 400 when the parameter does not start with http:// or
# https://, and 502 when fetch_external_page returns nothing.
get "/fetch" do
  content_type :json

  target = params[:url].to_s.strip
  unless target.match?(/\Ahttps?:\/\//i)
    halt 400, '{"error":"invalid url"}'
  end

  page = fetch_external_page(target)
  unless page
    halt 502, '{"error":"fetch failed"}'
  end

  preview = { title: page_title(page), text: page_text(page) }
  JSON.dump(preview)
end
598
+
517
599
  get "/search/?*" do
518
600
  requested = params["splat"].first.to_s.chomp("/")
519
601
  @query = params[:q].to_s.strip
@@ -1,3 +1,3 @@
1
1
  module MarkdownServer
2
- VERSION = "0.4.5"
2
+ VERSION = "0.4.6"
3
3
  end
data/views/layout.erb CHANGED
@@ -1535,14 +1535,47 @@
1535
1535
  });
1536
1536
  }
1537
1537
  }
1538
- } else {
1539
- var urlBody = '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>';
1540
- if (isExternal(href)) {
1541
- var host = '';
1542
- try { host = new URL(href).hostname; } catch(e) {}
1543
- if (host) urlBody += '<p style="margin:0;color:#888;font-family:sans-serif;font-size:0.82rem">External: <strong style="color:#555">' + escHtml(host) + '</strong></p>';
1538
+ } else if (isExternal(href)) {
1539
+ var extKey = 'ext:' + href;
1540
+ var extCached = cache[extKey];
1541
+ if (extCached && typeof extCached === 'object') {
1542
+ showPopup(x, y, extCached.title || label,
1543
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1544
+ '<div style="font-family:sans-serif;font-size:0.82rem;line-height:1.55;color:#444;margin-top:0.5rem">' + escHtml(extCached.text) + '</div>');
1545
+ } else if (extCached === false) {
1546
+ showPopup(x, y, label,
1547
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1548
+ '<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1549
+ } else {
1550
+ showPopup(x, y, label, '<p style="opacity:0.5;margin:0;font-family:sans-serif">Loading\u2026</p>');
1551
+ if (extCached === undefined) {
1552
+ cache[extKey] = null;
1553
+ fetch('/fetch?url=' + encodeURIComponent(href))
1554
+ .then(function(r) { return r.ok ? r.json() : null; })
1555
+ .then(function(data) {
1556
+ if (!data || data.error) {
1557
+ cache[extKey] = false;
1558
+ updatePopup(
1559
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1560
+ '<p style="margin:0.5rem 0 0;color:#888;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1561
+ return;
1562
+ }
1563
+ cache[extKey] = { title: data.title, text: data.text };
1564
+ updatePopup(
1565
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1566
+ '<div style="font-family:sans-serif;font-size:0.82rem;line-height:1.55;color:#444;margin-top:0.5rem">' + escHtml(data.text) + '</div>',
1567
+ data.title || label);
1568
+ })
1569
+ .catch(function() {
1570
+ cache[extKey] = false;
1571
+ updatePopup(
1572
+ '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>' +
1573
+ '<p style="margin:0.5rem 0 0;color:#c44;font-family:sans-serif;font-size:0.82rem">Could not fetch page content.</p>');
1574
+ });
1575
+ }
1544
1576
  }
1545
- showPopup(x, y, label, urlBody);
1577
+ } else {
1578
+ showPopup(x, y, label, '<div class="link-ctx-popup-url">' + escHtml(href) + '</div>');
1546
1579
  }
1547
1580
  }
1548
1581
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markdownr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.5
4
+ version: 0.4.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Dunn