upmark 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +2 -2
- data/README.md +34 -1
- data/Rakefile +1 -1
- data/lib/upmark/parser/xml.rb +3 -1
- data/lib/upmark/version.rb +1 -1
- data/spec/acceptance/upmark_spec.rb +10 -0
- metadata +9 -9
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
upmark (0.1.2)
|
4
|
+
upmark (0.1.3)
|
5
5
|
parslet
|
6
6
|
|
7
7
|
GEM
|
@@ -10,7 +10,7 @@ GEM
|
|
10
10
|
blankslate (2.1.2.4)
|
11
11
|
diff-lcs (1.1.3)
|
12
12
|
multi_json (1.0.3)
|
13
|
-
parslet (1.2.
|
13
|
+
parslet (1.2.3)
|
14
14
|
blankslate (~> 2.0)
|
15
15
|
rspec (2.6.0)
|
16
16
|
rspec-core (~> 2.6.0)
|
data/README.md
CHANGED
@@ -8,12 +8,45 @@ A HTML to Markdown converter.
|
|
8
8
|
|
9
9
|
## Usage
|
10
10
|
|
11
|
-
|
11
|
+
In ruby:
|
12
12
|
|
13
|
+
require "upmark"
|
13
14
|
html = %q{<p>messenger <strong>bag</strong> skateboard</p>}
|
14
15
|
markdown = Upmark.convert(html)
|
15
16
|
puts markdown
|
16
17
|
|
18
|
+
From the command-line:
|
19
|
+
|
20
|
+
> upmark foo.html
|
21
|
+
|
22
|
+
You can also pipe poorly formatted HTML documents through `tidy` before piping them into `upmark`:
|
23
|
+
|
24
|
+
> cat bar.html | tidy -asxhtml -indent -quiet --show-errors 0 --show-warnings 0 --show-body-only 1 --wrap 0 | upmark
|
25
|
+
|
26
|
+
## Features
|
27
|
+
|
28
|
+
Upmark will convert the following (arbitrarily nested) HTML elements to Markdown:
|
29
|
+
|
30
|
+
* `strong`
|
31
|
+
* `em`
|
32
|
+
* `p`
|
33
|
+
* `a`
|
34
|
+
* `h1`, `h2`, `h3`
|
35
|
+
* `ul`
|
36
|
+
* `ol`
|
37
|
+
* `br`
|
38
|
+
|
39
|
+
It will also pass through block and span-level HTML elements (e.g. `table`, `div`, `span`, etc) which aren't used by Markdown.
|
40
|
+
|
41
|
+
## How it works
|
42
|
+
|
43
|
+
Upmark defines a parsing expression grammar (PEG) using the very awesome [Parslet](http://kschiess.github.com/parslet/) gem. This PEG is then used to convert HTML into Markdown in 4 steps:
|
44
|
+
|
45
|
+
1. Parse the XHTML into an abstract syntax tree (AST).
|
46
|
+
2. Normalize the AST into a nested hash of HTML elements.
|
47
|
+
3. Mark the block and span-level subtrees which should be ignored (`table`, `div`, `span`, etc).
|
48
|
+
4. Convert the AST leaves into Markdown.
|
49
|
+
|
17
50
|
## License
|
18
51
|
|
19
52
|
Upmark is Copyright (c) 2011 The Conversation Media Group and distributed under the MIT license.
|
data/Rakefile
CHANGED
data/lib/upmark/parser/xml.rb
CHANGED
@@ -66,9 +66,11 @@ module Upmark
|
|
66
66
|
}
|
67
67
|
|
68
68
|
rule(:attribute_value) {
|
69
|
-
(match(/['"]/).absent? >> match(/[^<&]/)).repeat
|
69
|
+
(match(/['"]/).absent? >> (match(/[^<&]/) | entity_ref)).repeat
|
70
70
|
}
|
71
71
|
|
72
|
+
rule(:entity_ref) { match("&") >> name >> match(";") }
|
73
|
+
|
72
74
|
rule(:space) { match(/\s/).repeat(1) }
|
73
75
|
rule(:space?) { space.maybe }
|
74
76
|
end
|
data/lib/upmark/version.rb
CHANGED
data/spec/acceptance/upmark_spec.rb
CHANGED
@@ -13,6 +13,16 @@ describe Upmark, ".convert" do
|
|
13
13
|
MD
|
14
14
|
end
|
15
15
|
|
16
|
+
context "<a> hard" do
|
17
|
+
let(:html) { <<-HTML.strip }
|
18
|
+
<p><a href="http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&sKeywords=business">Manager, Business Solutions</a></p>
|
19
|
+
HTML
|
20
|
+
|
21
|
+
it { should == <<-MD.strip }
|
22
|
+
[Manager, Business Solutions](http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&sKeywords=business "")
|
23
|
+
MD
|
24
|
+
end
|
25
|
+
|
16
26
|
context "<img>" do
|
17
27
|
let(:html) { <<-HTML.strip }
|
18
28
|
<img src="http://helvetica.com/image.gif" title="art party organic" alt="messenger bag skateboard" />
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upmark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,11 +10,11 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2012-03-28 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rspec
|
17
|
-
requirement: &
|
17
|
+
requirement: &70329857124760 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>='
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: '0'
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *70329857124760
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: simplecov
|
28
|
-
requirement: &
|
28
|
+
requirement: &70329857124340 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ! '>='
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *70329857124340
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: parslet
|
39
|
-
requirement: &
|
39
|
+
requirement: &70329857123920 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ! '>='
|
@@ -44,7 +44,7 @@ dependencies:
|
|
44
44
|
version: '0'
|
45
45
|
type: :runtime
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *70329857123920
|
48
48
|
description: Upmark has the skills to convert your HTML to Markdown.
|
49
49
|
email: dev@theconversation.edu.au
|
50
50
|
executables:
|
@@ -95,7 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
95
95
|
version: '0'
|
96
96
|
requirements: []
|
97
97
|
rubyforge_project: upmark
|
98
|
-
rubygems_version: 1.8.
|
98
|
+
rubygems_version: 1.8.11
|
99
99
|
signing_key:
|
100
100
|
specification_version: 3
|
101
101
|
summary: A HTML to Markdown converter.
|