ruby-readability 0.7.0 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +25 -0
- data/.rspec +1 -1
- data/README.md +3 -6
- data/lib/readability.rb +73 -21
- data/ruby-readability.gemspec +1 -4
- data/spec/fixtures/codinghorror.html +189 -0
- data/spec/fixtures/images/Confusion_of_Tongues.png +0 -0
- data/spec/fixtures/images/JohnPinhole.jpg +0 -0
- data/spec/fixtures/nested_images.html +11 -0
- data/spec/readability_spec.rb +315 -100
- data/spec/spec_helper.rb +0 -6
- metadata +28 -35
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f83eb55e4c0c4c30ad54e8e7104d68da8a5eb2b4d9cc76b45255055d89bf4b5c
|
4
|
+
data.tar.gz: 4d003c39b589477449bedd34634c5482dd503e94bfe24b9a5c29ea94f9b49f83
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e799e831297b18b381c3b1caad19531f99fe084f640afbddd1cf91e75fe234d3af4618f07e02a0c6214824726e3afe79accbb8ea5f0d66d9117b13112d22e8ef
|
7
|
+
data.tar.gz: 404d3a1bc702f3bd609e8c3ba8e37d6f023b2a3c126c278e7463a3dfee1cc5bf683f6c0c75cfabbb14e477f582b33cc8204d8682f33ed9a235b6fac8e90d9ad2
|
@@ -0,0 +1,25 @@
|
|
1
|
+
name: Ruby
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ master ]
|
6
|
+
pull_request:
|
7
|
+
branches: [ master ]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
test:
|
11
|
+
|
12
|
+
runs-on: ubuntu-latest
|
13
|
+
strategy:
|
14
|
+
matrix:
|
15
|
+
ruby-version: ['2.7']
|
16
|
+
|
17
|
+
steps:
|
18
|
+
- uses: actions/checkout@v2
|
19
|
+
- name: Set up Ruby
|
20
|
+
uses: ruby/setup-ruby@v1
|
21
|
+
with:
|
22
|
+
ruby-version: ${{ matrix.ruby-version }}
|
23
|
+
bundler-cache: true # runs 'bundle install' and caches installed gems automatically
|
24
|
+
- name: Run tests
|
25
|
+
run: bundle exec rspec
|
data/.rspec
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
--colour
|
2
|
-
--format
|
2
|
+
--format documentation -c
|
data/README.md
CHANGED
@@ -7,7 +7,7 @@ webpage. It is a Ruby port of arc90's readability project.
|
|
7
7
|
Build Status
|
8
8
|
------------
|
9
9
|
|
10
|
-
[![
|
10
|
+
[![Ruby](https://github.com/cantino/ruby-readability/actions/workflows/ruby.yml/badge.svg?branch=master)](https://github.com/cantino/ruby-readability/actions/workflows/ruby.yml)
|
11
11
|
|
12
12
|
Install
|
13
13
|
-------
|
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
|
|
41
41
|
* `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
|
42
42
|
removes `<p>` tags that contain only images;
|
43
43
|
* `:attributes`: whitelist of allowed attributes;
|
44
|
-
* `:debug`: provide debugging output, defaults false;
|
44
|
+
* `:debug`: provide debugging output, defaults false; supports setting a Proc;
|
45
45
|
* `:encoding`: if the page is of a known encoding, you can specify it; if left
|
46
46
|
unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
|
47
47
|
to disable guessing, supply `:do_not_guess_encoding => true`;
|
@@ -78,6 +78,7 @@ feature requires that the `fastimage` gem be installed.
|
|
78
78
|
Related Projects
|
79
79
|
----------------
|
80
80
|
|
81
|
+
* [readability.cr](https://github.com/joenas/readability.cr) - Port of ruby-readability's port of arc90's readability project to Crystal
|
81
82
|
* [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
|
82
83
|
|
83
84
|
Potential Issues
|
@@ -102,7 +103,3 @@ License
|
|
102
103
|
This code is under the Apache License 2.0. See <http://www.apache.org/licenses/LICENSE-2.0>.
|
103
104
|
|
104
105
|
Ruby port by cantino, starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
|
105
|
-
|
106
|
-
|
107
|
-
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/cantino/ruby-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
|
108
|
-
|
data/lib/readability.rb
CHANGED
@@ -17,9 +17,12 @@ module Readability
|
|
17
17
|
:min_image_height => 80,
|
18
18
|
:ignore_image_format => [],
|
19
19
|
:blacklist => nil,
|
20
|
-
:whitelist => nil
|
20
|
+
:whitelist => nil,
|
21
|
+
:elements_to_score => ["p", "td", "pre"],
|
22
|
+
:likely_siblings => ["p"],
|
23
|
+
:ignore_redundant_nesting => false
|
21
24
|
}.freeze
|
22
|
-
|
25
|
+
|
23
26
|
REGEXES = {
|
24
27
|
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
25
28
|
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
|
@@ -33,7 +36,7 @@ module Readability
|
|
33
36
|
:killBreaksRe => /(<br\s*\/?>(\s| ?)*){1,}/,
|
34
37
|
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
35
38
|
}
|
36
|
-
|
39
|
+
|
37
40
|
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
|
38
41
|
|
39
42
|
def initialize(input, options = {})
|
@@ -48,7 +51,7 @@ module Readability
|
|
48
51
|
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
|
49
52
|
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
|
50
53
|
@weight_classes = @options[:weight_classes]
|
51
|
-
@clean_conditionally = @options[:clean_conditionally]
|
54
|
+
@clean_conditionally = !!@options[:clean_conditionally]
|
52
55
|
@best_candidate_has_image = true
|
53
56
|
make_html
|
54
57
|
handle_exclusions!(@options[:whitelist], @options[:blacklist])
|
@@ -143,11 +146,11 @@ module Readability
|
|
143
146
|
|
144
147
|
(list_images.empty? and content != @html) ? images(@html, true) : list_images
|
145
148
|
end
|
146
|
-
|
149
|
+
|
147
150
|
def images_with_fqdn_uris!(source_uri)
|
148
151
|
images_with_fqdn_uris(@html, source_uri)
|
149
152
|
end
|
150
|
-
|
153
|
+
|
151
154
|
def images_with_fqdn_uris(document = @html.dup, source_uri)
|
152
155
|
uri = URI.parse(source_uri)
|
153
156
|
host = uri.host
|
@@ -159,7 +162,7 @@ module Readability
|
|
159
162
|
images = []
|
160
163
|
document.css("img").each do |elem|
|
161
164
|
begin
|
162
|
-
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
165
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
163
166
|
images << elem['src'].to_s
|
164
167
|
rescue URI::InvalidURIError => exc
|
165
168
|
elem.remove
|
@@ -260,15 +263,27 @@ module Readability
|
|
260
263
|
# Things like preambles, content split by ads that we removed, etc.
|
261
264
|
|
262
265
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
266
|
+
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
|
263
267
|
output = Nokogiri::XML::Node.new('div', @html)
|
264
|
-
|
268
|
+
|
269
|
+
# If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
|
270
|
+
# find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
|
271
|
+
# related content detection, but could lead to false positives. Not supported in arc90's readability.
|
272
|
+
node =
|
273
|
+
if options[:ignore_redundant_nesting]
|
274
|
+
closest_node_with_siblings(best_candidate[:elem])
|
275
|
+
else
|
276
|
+
best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
|
277
|
+
end
|
278
|
+
|
279
|
+
node.parent.children.each do |sibling|
|
265
280
|
append = false
|
266
|
-
append = true if sibling == best_candidate[:elem]
|
281
|
+
append = true if sibling == node
|
267
282
|
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
268
283
|
|
269
|
-
if sibling.name.downcase == "p"
|
284
|
+
if downcased_likely_siblings.include?(sibling.name.downcase)
|
270
285
|
link_density = get_link_density(sibling)
|
271
|
-
node_content = sibling.text
|
286
|
+
node_content = sibling.text.strip
|
272
287
|
node_length = node_content.length
|
273
288
|
|
274
289
|
append = if node_length > 80 && link_density < 0.25
|
@@ -288,6 +303,23 @@ module Readability
|
|
288
303
|
output
|
289
304
|
end
|
290
305
|
|
306
|
+
def closest_node_with_siblings(element)
|
307
|
+
node = element
|
308
|
+
|
309
|
+
until node.node_name == 'body'
|
310
|
+
siblings = node.parent.children
|
311
|
+
non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
|
312
|
+
|
313
|
+
if non_empty.size > 1
|
314
|
+
return node
|
315
|
+
else
|
316
|
+
node = node.parent
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
node
|
321
|
+
end
|
322
|
+
|
291
323
|
def select_best_candidate(candidates)
|
292
324
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
293
325
|
|
@@ -310,7 +342,7 @@ module Readability
|
|
310
342
|
|
311
343
|
def score_paragraphs(min_text_length)
|
312
344
|
candidates = {}
|
313
|
-
@html.css(
|
345
|
+
@html.css(options[:elements_to_score].join(',')).each do |elem|
|
314
346
|
parent_node = elem.parent
|
315
347
|
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
|
316
348
|
inner_text = elem.text
|
@@ -369,7 +401,11 @@ module Readability
|
|
369
401
|
end
|
370
402
|
|
371
403
|
def debug(str)
|
372
|
-
|
404
|
+
if options[:debug].respond_to?(:call)
|
405
|
+
options[:debug].call(str)
|
406
|
+
elsif options[:debug]
|
407
|
+
puts str
|
408
|
+
end
|
373
409
|
end
|
374
410
|
|
375
411
|
def remove_unlikely_candidates!
|
@@ -423,6 +459,9 @@ module Readability
|
|
423
459
|
|
424
460
|
# We'll sanitize all elements using a whitelist
|
425
461
|
base_whitelist = @options[:tags] || %w[div p]
|
462
|
+
all_tags_whitelisted = base_whitelist.include?("*")
|
463
|
+
all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
|
464
|
+
|
426
465
|
# We'll add whitespace instead of block elements,
|
427
466
|
# so a<br>b will have a nice space between them
|
428
467
|
base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
|
@@ -435,8 +474,8 @@ module Readability
|
|
435
474
|
|
436
475
|
([node] + node.css("*")).each do |el|
|
437
476
|
# If element is in whitelist, delete all its attributes
|
438
|
-
if whitelist[el.node_name]
|
439
|
-
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
|
477
|
+
if all_tags_whitelisted || whitelist[el.node_name]
|
478
|
+
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
|
440
479
|
|
441
480
|
# Otherwise, replace the element with its contents
|
442
481
|
else
|
@@ -465,30 +504,43 @@ module Readability
|
|
465
504
|
|
466
505
|
def clean_conditionally(node, candidates, selector)
|
467
506
|
return unless @clean_conditionally
|
507
|
+
|
468
508
|
node.css(selector).each do |el|
|
469
509
|
weight = class_weight(el)
|
470
510
|
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
471
511
|
name = el.name.downcase
|
472
|
-
|
512
|
+
remove = false
|
513
|
+
message = nil
|
514
|
+
|
473
515
|
if weight + content_score < 0
|
474
|
-
|
475
|
-
|
516
|
+
remove = true
|
517
|
+
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
|
476
518
|
elsif el.text.count(",") < 10
|
477
519
|
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
478
520
|
counts["li"] -= 100
|
479
521
|
|
480
522
|
# For every img under a noscript tag discount one from the count to avoid double counting
|
481
523
|
counts["img"] -= el.css("noscript").css("img").length
|
482
|
-
|
524
|
+
|
483
525
|
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
|
484
526
|
link_density = get_link_density(el)
|
485
527
|
|
486
528
|
reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
|
487
529
|
if reason
|
488
|
-
|
489
|
-
|
530
|
+
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
|
531
|
+
remove = true
|
490
532
|
end
|
491
533
|
end
|
534
|
+
|
535
|
+
if options[:clean_conditionally].respond_to?(:call)
|
536
|
+
context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
|
537
|
+
remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
|
538
|
+
end
|
539
|
+
|
540
|
+
if remove
|
541
|
+
debug(message || "Conditionally cleaned by user-specified function.")
|
542
|
+
el.remove
|
543
|
+
end
|
492
544
|
end
|
493
545
|
end
|
494
546
|
|
data/ruby-readability.gemspec
CHANGED
@@ -3,15 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.7.0'
|
6
|
+
s.version = '0.7.2'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/cantino/ruby-readability"
|
10
10
|
s.summary = %q{Port of arc90's readability project to ruby}
|
11
11
|
s.description = %q{Port of arc90's readability project to ruby}
|
12
12
|
|
13
|
-
s.rubyforge_project = "ruby-readability"
|
14
|
-
|
15
13
|
s.files = `git ls-files`.split("\n")
|
16
14
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
17
15
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
@@ -19,7 +17,6 @@ Gem::Specification.new do |s|
|
|
19
17
|
|
20
18
|
s.add_development_dependency "rspec", ">= 2.8"
|
21
19
|
s.add_development_dependency "rspec-expectations", ">= 2.8"
|
22
|
-
s.add_development_dependency "rr", ">= 1.0"
|
23
20
|
s.add_dependency 'nokogiri', '>= 1.6.0'
|
24
21
|
s.add_dependency 'guess_html_encoding', '>= 0.0.4'
|
25
22
|
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8"/>
|
5
|
+
<title>Standard Flavored Markdown</title>
|
6
|
+
<meta name="description" content=""/>
|
7
|
+
<meta name="HandheldFriendly" content="True"/>
|
8
|
+
<meta name="MobileOptimized" content="320"/>
|
9
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
10
|
+
<link rel="shortcut icon" href="/assets/images/favicon.ico?v=8684b6a35e">
|
11
|
+
<link rel="apple-touch-icon" href="/assets/images/codinghorror-app-icon.png?v=8684b6a35e">
|
12
|
+
<meta name="google-site-verification" content="sl0m9SU_4V0JcvjWlOX4dUFBR6VS2P4tlxjJMo0gphU"/>
|
13
|
+
<link rel="stylesheet" type="text/css" href="/assets/css/screen.css?v=8684b6a35e"/>
|
14
|
+
<link rel="stylesheet" type="text/css" href="//fonts.googleapis.com/css?family=Open+Sans:400italic,700italic,400,700"/>
|
15
|
+
<link rel="alternate" type="application/rss+xml" title="Coding Horror" href="http://feeds.feedburner.com/codinghorror">
|
16
|
+
</head>
|
17
|
+
<body class="post-template">
|
18
|
+
<header class="site-head">
|
19
|
+
<div class="site-head-content">
|
20
|
+
<a class="blog-logo" href="http://blog.codinghorror.com"><img src="/assets/images/codinghorror-app-icon.png?v=8684b6a35e" alt="Coding Horror Logo" width="158" height="158"/></a>
|
21
|
+
<h1 class="blog-title"><a href="http://blog.codinghorror.com">Coding Horror</a></h1>
|
22
|
+
<h2 class="blog-description">programming and human factors</h2>
|
23
|
+
<div class="site-search">
|
24
|
+
<script>
|
25
|
+
(function() {
|
26
|
+
var cx = '016956275695630057531:lqveu9tah7y';
|
27
|
+
var gcse = document.createElement('script');
|
28
|
+
gcse.type = 'text/javascript';
|
29
|
+
gcse.async = true;
|
30
|
+
gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + '//www.google.com/cse/cse.js?cx=' + cx;
|
31
|
+
var s = document.getElementsByTagName('script')[0];
|
32
|
+
s.parentNode.insertBefore(gcse, s);
|
33
|
+
})();
|
34
|
+
</script>
|
35
|
+
<gcse:search></gcse:search>
|
36
|
+
</div>
|
37
|
+
</div>
|
38
|
+
</header>
|
39
|
+
<div class="wrap clearfix">
|
40
|
+
<div class="clearfix"></div>
|
41
|
+
<main class="content" role="main">
|
42
|
+
<article class="post">
|
43
|
+
<span class="post-meta"><time datetime="2014-09-03">03 Sep 2014</time> </span>
|
44
|
+
<h1 class="post-title">Standard Flavored Markdown</h1>
|
45
|
+
<section class="post-content">
|
46
|
+
<p>In 2009 I <a href="http://blog.codinghorror.com/responsible-open-source-code-parenting/">lamented the state of Markdown</a>:</p>
|
47
|
+
<blockquote>
|
48
|
+
<p>Right now we have the worst of both worlds. Lack of leadership from the top, and a bunch of fragmented, poorly coordinated community efforts to advance Markdown, none of which are officially canon. This isn't merely incovenient for anyone trying to find accurate information about Markdown; it's actually harming the project's future. </p>
|
49
|
+
</blockquote>
|
50
|
+
<p>In late 2012, David Greenspan from <a href="https://www.meteor.com/">Meteor</a> approached me and proposed we move forward, and <a href="http://blog.codinghorror.com/the-future-of-markdown/">a project crystallized</a>:</p>
|
51
|
+
<blockquote>
|
52
|
+
<p>I propose that Stack Exchange, GitHub, Meteor, Reddit, and any other company with lots of traffic and a strategic investment in Markdown, all work together to <strong>come up with an official Markdown specification, and standard test suites to validate Markdown implementations</strong>. We've all been working at cross purposes for too long, accidentally fragmenting Markdown while popularizing it.</p>
|
53
|
+
</blockquote>
|
54
|
+
<p>We formed a small private working group with key representatives from GitHub, from Reddit, from Stack Exchange, from the open source community. We spent months hashing out the details and agreeing on the necessary changes to turn Markdown into a language you can parse without feeling like you just walked through a sewer – while preserving the simple, clear, ASCII email inspired spirit of Markdown.</p>
|
55
|
+
<p>We really struggled with this at <a href="http://www.discourse.org">Discourse</a>, which is also based on Markdown, but an even more complex dialect than the one we built at Stack Overflow. In Discourse, you can mix <em>three</em> forms of markup interchangeably:</p>
|
56
|
+
<ul>
|
57
|
+
<li>Markdown</li>
|
58
|
+
<li>HTML (safe subset)</li>
|
59
|
+
<li>BBCode (subset)</li>
|
60
|
+
</ul>
|
61
|
+
<p>Discourse is primarily a JavaScript app, so naturally we needed a nice, compliant implementation of Markdown in JavaScript. Surely such a thing exists, yes? Nope. Even in 2012, we found <em>zero</em> JavaScript implementations of Markdown that could pass the only Markdown test suite I know of, <a href="https://github.com/michelf/mdtest/">MDTest</a>. It isn't authoritative, it's a community created initiative that embodies its own decisions about rendering ambiguities in Markdown, but it's all we've got. We contributed many <a href="https://github.com/evilstreak/markdown-js/commits/master">upstream fixes to markdown.js</a> to make it pass MDTest – but it still only passes in our locally extended version.</p>
|
62
|
+
<p>As an open source project ourselves, we're perfectly happy contributing upstream code to improve it for everyone. But it's an indictment of the state of the Markdown ecosystem that any remotely popular implementation wasn't already testing itself against a formal spec and test suite. But who can blame them, because <i>it didn't exist!</i></p>
|
63
|
+
<p>Well, now it does.</p>
|
64
|
+
<p>It took a while, but I'm pleased to announce that <a href="http://standardmarkdown.com"><strong>Standard Markdown</strong></a> is now finally ready for public review.</p>
|
65
|
+
<p><strong><a href="http://standardmarkdown.com">standardmarkdown.com</a></strong></p>
|
66
|
+
<p>It's a spec, including embedded examples, and implementations in portable C and JavaScript. We strived mightily to stay true to the spirit of Markdown in writing it. The primary author, John MacFarlane, <a href="http://spec.standardmarkdown.com">explains in the introduction to the spec</a>:</p>
|
67
|
+
<blockquote>
|
68
|
+
<p>Because Gruber’s syntax description leaves many aspects of the syntax undetermined, writing a precise spec requires making a large number of decisions, many of them somewhat arbitrary. In making them, I have appealed to existing conventions and considerations of simplicity, readability, expressive power, and consistency. I have tried to ensure that “normal” documents in the many incompatible existing implementations of markdown will render, as far as possible, as their authors intended. And I have tried to make the rules for different elements work together harmoniously. In places where different decisions could have been made (for example, the rules governing list indentation), I have explained the rationale for my choices. In a few cases, I have departed slightly from the canonical syntax description, in ways that I think further the goals of markdown as stated in that description.</p>
|
69
|
+
</blockquote>
|
70
|
+
<p>Part of my contribution to the project is to host the discussion / mailing list for Standard Markdown in a Discourse instance. </p>
|
71
|
+
<p><strong><a href="http://talk.standardmarkdown.com">talk.standardmarkdown.com</a></strong></p>
|
72
|
+
<p>Fortunately, Discourse itself <a href="http://blog.discourse.org/2014/08/introducing-discourse-1-0/">just reached version 1.0</a>. If the only thing Standard Markdown does is help save a few users from the continuing horror that is mailing list web UI, we all win.</p>
|
73
|
+
<p>What I'm most excited about is that we got a massive contribution from the one person who, in my mind, was the most perfect person in the world to work on this project: <a href="http://johnmacfarlane.net/">John MacFarlane</a>. He took our feedback and wrote the entire Standard Markdown spec and both implementations.</p>
|
74
|
+
<p><a href="http://johnmacfarlane.net/"><img src="/content/images/2014/Sep/JohnPinhole.jpg" alt="" title=""/></a></p>
|
75
|
+
<p>A lot of people know of John through his <a href="http://johnmacfarlane.net/pandoc/">Pandoc</a> project, which is amazing in its own right, but I found out about him because he built <a href="http://johnmacfarlane.net/babelmark2/faq.html">Babelmark</a>. I learned to refer to Babelmark extensively while working on Stack Overflow and MarkdownSharp, a C# implementation of Markdown.</p>
|
76
|
+
<p>Here's how crazy Markdown is: to decide what the "correct" behavior is, you provide sample Markdown input to 20+ different Markdown parsers … and then pray that some consensus emerges in all their output. That's what Babelmark does.</p>
|
77
|
+
<p>Consider this simple Markdown example:</p>
|
78
|
+
<pre><code># Hello there
|
79
|
+
|
80
|
+
This is a paragraph.
|
81
|
+
|
82
|
+
- one
|
83
|
+
- two
|
84
|
+
- three
|
85
|
+
- four
|
86
|
+
|
87
|
+
1. pirate
|
88
|
+
2. ninja
|
89
|
+
3. zombie
|
90
|
+
</code></pre>
|
91
|
+
<p>Just for that, I count <a href="http://johnmacfarlane.net/babelmark2/?text=%23+Hello+there%0A%0AThis+is+a+paragraph.%0A%0A-+one%0A-+two%0A-+three%0A-+four%0A%0A1.+pirate%0A2.+ninja%0A3.+zombie"><em>fifteen</em> different rendered outputs</a> from 22 different Markdown parsers.</p>
|
92
|
+
<p><a href="http://en.wikipedia.org/wiki/Tower_of_Babel"><img src="/content/images/2014/Sep/Confusion_of_Tongues.png" alt="" title=""/></a></p>
|
93
|
+
<p>In Markdown, we <em>literally</em> built a <a href="http://en.wikipedia.org/wiki/Tower_of_Babel">Tower of Babel</a>. </p>
|
94
|
+
<p>Have I mentioned that it's a good idea for a language to have a formal specification and test suites? Maybe now you can see why that is.</p>
|
95
|
+
<p>Oh, and in his spare time, John is also the chair of the department of philosophy at the University of California, Berkeley. <em>No big deal.</em> While I don't mean to minimize the contributions of anyone to the Standard Markdown project, we all owe a special thanks to John.</p>
|
96
|
+
<p>Markdown is indeed everywhere. And that's a good thing. But it needs to be sane, parseable, and standard. That's the goal of <a href="http://standardmarkdown.com/">Standard Markdown</a> — but we need your help to get there. If you use Markdown on a website, <strong>ask what it would take for that site to become compatible with Standard Markdown</strong>; when you see the word "Markdown" you have the right to expect consistent rendering across all the websites you visit. If you implement Markdown, <a href="http://spec.standardmarkdown.com">take a look at the spec</a>, try to <strong>make your parser compatible with Standard Markdown</strong>, and <a href="http://talk.standardmarkdown.com">discuss improvements or refinements</a> to the spec.</p>
|
97
|
+
<p><span style="color:red;">Update:</span> The project was renamed <a href="http://commonmark.org">CommonMark</a>. See <a href="http://blog.codinghorror.com/standard-markdown-is-now-common-markdown/">my subsequent blog post</a>.</p>
|
98
|
+
<table>
|
99
|
+
<tr><td class="welovecodinghorror">
|
100
|
+
[advertisement] How are you showing off your awesome? Create a <a href="http://careers.stackoverflow.com/cv" rel="nofollow">Stack Overflow Careers profile</a> and show off all of your hard work from Stack Overflow, Github, and virtually every other coding site. Who knows, you might even get recruited for a great <a href="http://careers.stackoverflow.com/jobs" rel="nofollow">new position</a>!
|
101
|
+
</td></tr>
|
102
|
+
</table>
|
103
|
+
</section>
|
104
|
+
<footer class="post-footer">
|
105
|
+
<section class="author">
|
106
|
+
<h4>Written by Jeff Atwood</h4>
|
107
|
+
<p>Indoor enthusiast. Co-founder of Stack Exchange and Discourse. Disclaimer: I have no idea what I'm talking about. Find me here: <a href="http://twitter.com/codinghorror">http://twitter.com/codinghorror</a></p>
|
108
|
+
</section>
|
109
|
+
</footer>
|
110
|
+
<div id="nrelate_related_placeholder"></div> <script async id="nrelate_loader_script" type="text/javascript" src="http://static.nrelate.com/common_js/0.52.1/loader.min.js"></script>
|
111
|
+
</article>
|
112
|
+
<div id="discourse-comments"></div>
|
113
|
+
<script type="text/javascript">
|
114
|
+
var discourseUrl = "http://discourse.codinghorror.com/",
|
115
|
+
discourseEmbedUrl = 'http://blog.codinghorror.com/standard-flavored-markdown/';
|
116
|
+
|
117
|
+
(function() {
|
118
|
+
var d = document.createElement('script'); d.type = 'text/javascript'; d.async = true;
|
119
|
+
d.src = discourseUrl + 'javascripts/embed.js';
|
120
|
+
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(d);
|
121
|
+
})();
|
122
|
+
</script>
|
123
|
+
</main>
|
124
|
+
<aside class="sidebar">
|
125
|
+
|
126
|
+
<div id="carbonads-container"><div class="carbonad"><div id="azcarbon"></div><script type="text/javascript">var z = document.createElement("script"); z.type = "text/javascript"; z.async = true; z.src = "http://engine.carbonads.com/z/56742/azcarbon_2_1_0_VERT"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(z, s);</script></div></div>
|
127
|
+
<div id="hireme" class="hireme codinghorror" style="min-height: 220px; margin-bottom: 15px;"></div>
|
128
|
+
<script>
|
129
|
+
setTimeout(function () {
|
130
|
+
var a = document.createElement("script");
|
131
|
+
var b = document.getElementsByTagName('script')[0];
|
132
|
+
a.src = "http://careers.stackoverflow.com/ad/js";
|
133
|
+
a.async = true;
|
134
|
+
a.type = "text/javascript";
|
135
|
+
b.parentNode.insertBefore(a, b);
|
136
|
+
}, 5);
|
137
|
+
</script>
|
138
|
+
<div class="welovecodinghorror" style="margin-bottom:15px">
|
139
|
+
[ad] Enjoy the blog? Read <b><a href="http://www.hyperink.com/Effective-Programming-More-Than-Writing-Code-b1559">Effective Programming: More than Writing Code</a></b> and <b><a href="http://www.hyperink.com/How-To-Stop-Sucking-And-Be-Awesome-Instead-b9A74B5CBA6">How to Stop Sucking and Be Awesome Instead</a></b> on your Kindle, iPad, Nook, or as a PDF.
|
140
|
+
</div>
|
141
|
+
<h3>Resources</h3>
|
142
|
+
<ul>
|
143
|
+
<li><a href="/about-me/">About Me</a></li>
|
144
|
+
<li><a href="http://twitter.com/codinghorror">@codinghorror</a></li>
|
145
|
+
<li><a href="http://www.discourse.org/">discourse.org</a></li>
|
146
|
+
<li><a href="http://stackexchange.com/">stackexchange.com</a></li>
|
147
|
+
<li><a href="/recommended-reading-for-developers/">Recommended Reading</a></li>
|
148
|
+
</ul>
|
149
|
+
<ul>
|
150
|
+
<li><a href="http://feeds.feedburner.com/codinghorror" class="icon-feed"> Subscribe in a reader</a></li>
|
151
|
+
<li><a href="http://feedburner.google.com/fb/a/mailverify?uri=codinghorror&loc=en_US" class="icon-email"> Subscribe via email</a></li>
|
152
|
+
</ul>
|
153
|
+
<p>Coding Horror has been continuously published since 2004</p>
|
154
|
+
<ul>
|
155
|
+
<li><img src="http://feeds.feedburner.com/~fc/codinghorror?bg=EEEEEE&fg=111111&anim=0" height="26" width="88" style="border:0" alt="Count of RSS readers"></li>
|
156
|
+
<li><a href="http://my.statcounter.com/project/standard/stats.php?project_id=2600027&guest=1">Traffic Stats</a></li>
|
157
|
+
</ul>
|
158
|
+
<footer class="site-footer">
|
159
|
+
<section class="copyright">Copyright <a rel="author" href="https://profiles.google.com/codinghorror1">Jeff Atwood</a> © 2014<br/>
|
160
|
+
Logo image © 1993 Steven C. McConnell <br/>
|
161
|
+
Proudly published with <a class="icon-ghost" href="http://ghost.org">Ghost</a></section>
|
162
|
+
</footer></aside>
|
163
|
+
</div>
|
164
|
+
<script src="/public/jquery.min.js?v=8684b6a35e"></script>
|
165
|
+
<script type="text/javascript" src="/assets/js/jquery.fitvids.js?v=8684b6a35e"></script>
|
166
|
+
<script type="text/javascript" src="/assets/js/index.js?v=8684b6a35e"></script>
|
167
|
+
<script async src="http://www.statcounter.com/counter/counter.js"></script>
|
168
|
+
<noscript><a href="http://www.statcounter.com/"><img src="http://c26.statcounter.com/counter.php?sc_project=2600027&java=0&security=dcff5548&invisible=0" alt="web metrics"></a> </noscript>
|
169
|
+
|
170
|
+
<script>
|
171
|
+
document.write(unescape("%3Cscript src='" + (document.location.protocol == "https:" ? "https://sb" : "http://b") + ".scorecardresearch.com/beacon.js'%3E%3C/script%3E"));
|
172
|
+
</script>
|
173
|
+
<script>
|
174
|
+
COMSCORE.beacon({
|
175
|
+
c1: 2,
|
176
|
+
c2: "6035669",
|
177
|
+
c3: "",
|
178
|
+
c4: "http://www.codinghorror.com/blog/",
|
179
|
+
c5: "",
|
180
|
+
c6: "",
|
181
|
+
c15: ""
|
182
|
+
});
|
183
|
+
</script>
|
184
|
+
<noscript>
|
185
|
+
<img src="http://b.scorecardresearch.com/b?c1=2&c2=6035669&c3=&c4=http%3A%2F%2Fwww.codinghorror.com%2Fblog%2F&c5=&c6=&c15=&cv=1.3&cj=1" style="display:none" width="0" height="0" alt=""/>
|
186
|
+
</noscript>
|
187
|
+
<img src="/view.gif?page=/standard-flavored-markdown/" alt="" style="display:none" hidden />
|
188
|
+
</body>
|
189
|
+
</html>
|
Binary file
|
Binary file
|