upmark 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +2 -2
- data/README.md +34 -1
- data/Rakefile +1 -1
- data/lib/upmark/parser/xml.rb +3 -1
- data/lib/upmark/version.rb +1 -1
- data/spec/acceptance/upmark_spec.rb +10 -0
- metadata +9 -9
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
upmark (0.1.2)
|
4
|
+
upmark (0.1.3)
|
5
5
|
parslet
|
6
6
|
|
7
7
|
GEM
|
@@ -10,7 +10,7 @@ GEM
|
|
10
10
|
blankslate (2.1.2.4)
|
11
11
|
diff-lcs (1.1.3)
|
12
12
|
multi_json (1.0.3)
|
13
|
-
parslet (1.2.
|
13
|
+
parslet (1.2.3)
|
14
14
|
blankslate (~> 2.0)
|
15
15
|
rspec (2.6.0)
|
16
16
|
rspec-core (~> 2.6.0)
|
data/README.md
CHANGED
@@ -8,12 +8,45 @@ A HTML to Markdown converter.
|
|
8
8
|
|
9
9
|
## Usage
|
10
10
|
|
11
|
-
|
11
|
+
In ruby:
|
12
12
|
|
13
|
+
require "upmark"
|
13
14
|
html = %q{<p>messenger <strong>bag</strong> skateboard</p>}
|
14
15
|
markdown = Upmark.convert(html)
|
15
16
|
puts markdown
|
16
17
|
|
18
|
+
From the command-line:
|
19
|
+
|
20
|
+
> upmark foo.html
|
21
|
+
|
22
|
+
You can also pipe poorly formatted HTML documents through `tidy` before piping them into `upmark`:
|
23
|
+
|
24
|
+
> cat bar.html | tidy -asxhtml -indent -quiet --show-errors 0 --show-warnings 0 --show-body-only 1 --wrap 0 | upmark
|
25
|
+
|
26
|
+
## Features
|
27
|
+
|
28
|
+
Upmark will convert the following (arbitrarily nested) HTML elements to Markdown:
|
29
|
+
|
30
|
+
* `strong`
|
31
|
+
* `em`
|
32
|
+
* `p`
|
33
|
+
* `a`
|
34
|
+
* `h1`, `h2`, `h3`
|
35
|
+
* `ul`
|
36
|
+
* `ol`
|
37
|
+
* `br`
|
38
|
+
|
39
|
+
It will also pass through block and span-level HTML elements (e.g. `table`, `div`, `span`, etc) which aren't used by Markdown.
|
40
|
+
|
41
|
+
## How it works
|
42
|
+
|
43
|
+
Upmark defines a parsing expression grammar (PEG) using the very awesome [Parslet](http://kschiess.github.com/parslet/) gem. This PEG is then used to convert HTML into Markdown in 4 steps:
|
44
|
+
|
45
|
+
1. Parse the XHTML into an abstract syntax tree (AST).
|
46
|
+
2. Normalize the AST into a nested hash of HTML elements.
|
47
|
+
3. Mark the block and span-level subtrees which should be ignored (`table`, `div`, `span`, etc).
|
48
|
+
4. Convert the AST leaves into Markdown.
|
49
|
+
|
17
50
|
## License
|
18
51
|
|
19
52
|
Upmark is Copyright (c) 2011 The Conversation Media Group and distributed under the MIT license.
|
data/Rakefile
CHANGED
data/lib/upmark/parser/xml.rb
CHANGED
@@ -66,9 +66,11 @@ module Upmark
|
|
66
66
|
}
|
67
67
|
|
68
68
|
rule(:attribute_value) {
|
69
|
-
(match(/['"]/).absent? >> match(/[^<&]/)).repeat
|
69
|
+
(match(/['"]/).absent? >> (match(/[^<&]/) | entity_ref)).repeat
|
70
70
|
}
|
71
71
|
|
72
|
+
rule(:entity_ref) { match("&") >> name >> match(";") }
|
73
|
+
|
72
74
|
rule(:space) { match(/\s/).repeat(1) }
|
73
75
|
rule(:space?) { space.maybe }
|
74
76
|
end
|
data/lib/upmark/version.rb
CHANGED
data/spec/acceptance/upmark_spec.rb
CHANGED
@@ -13,6 +13,16 @@ describe Upmark, ".convert" do
|
|
13
13
|
MD
|
14
14
|
end
|
15
15
|
|
16
|
+
context "<a> hard" do
|
17
|
+
let(:html) { <<-HTML.strip }
|
18
|
+
<p><a href="http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&amp;sKeywords=business">Manager, Business Solutions</a></p>
|
19
|
+
HTML
|
20
|
+
|
21
|
+
it { should == <<-MD.strip }
|
22
|
+
[Manager, Business Solutions](http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&sKeywords=business "")
|
23
|
+
MD
|
24
|
+
end
|
25
|
+
|
16
26
|
context "<img>" do
|
17
27
|
let(:html) { <<-HTML.strip }
|
18
28
|
<img src="http://helvetica.com/image.gif" title="art party organic" alt="messenger bag skateboard" />
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upmark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,11 +10,11 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2012-03-28 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rspec
|
17
|
-
requirement: &
|
17
|
+
requirement: &70329857124760 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>='
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: '0'
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *70329857124760
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: simplecov
|
28
|
-
requirement: &
|
28
|
+
requirement: &70329857124340 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ! '>='
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *70329857124340
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: parslet
|
39
|
-
requirement: &
|
39
|
+
requirement: &70329857123920 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ! '>='
|
@@ -44,7 +44,7 @@ dependencies:
|
|
44
44
|
version: '0'
|
45
45
|
type: :runtime
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *70329857123920
|
48
48
|
description: Upmark has the skills to convert your HTML to Markdown.
|
49
49
|
email: dev@theconversation.edu.au
|
50
50
|
executables:
|
@@ -95,7 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
95
95
|
version: '0'
|
96
96
|
requirements: []
|
97
97
|
rubyforge_project: upmark
|
98
|
-
rubygems_version: 1.8.
|
98
|
+
rubygems_version: 1.8.11
|
99
99
|
signing_key:
|
100
100
|
specification_version: 3
|
101
101
|
summary: A HTML to Markdown converter.
|