habari2md 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data.tar.gz.sig +0 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +14 -0
- data/README.md +64 -0
- data/Rakefile +1 -0
- data/bin/habari2md +44 -0
- data/certs/oz.pem +20 -0
- data/habari2md.gemspec +29 -0
- data/lib/habari2md.rb +164 -0
- data/lib/habari2md/version.rb +3 -0
- data/lib/vendor/html2text.py +914 -0
- metadata +148 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c2fb0d61a3de12d336d24ab9b5d3e47ec7b9a098
|
4
|
+
data.tar.gz: 063bedd9049364662e8e200efbf4a2a1c97a9f46
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 17550bf8808495e447d04fe9f5475e170a28f8119319b89a5099aad97da5569b1d7a95c4cb3e7961c742536ca5851a73939053f25a1ef85c3ca255b6597fe907
|
7
|
+
data.tar.gz: 5c3a0c209e76eceb05092ed68e9fa1ab555f2bb63172357f70d068657f1e6094caa56e637916452f30fd9a0bdaa726708034cc45f6fb20a3d44024b292fa1ecb
|
checksums.yaml.gz.sig
ADDED
data.tar.gz.sig
ADDED
Binary file
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Copyright (c) 2014 Arnaud Berthomier
|
2
|
+
|
3
|
+
habari2md is free software: you can redistribute it and/or modify it
|
4
|
+
under the terms of the GNU General Public License as published by the
|
5
|
+
Free Software Foundation, either version 3 of the License, or (at your
|
6
|
+
option) any later version.
|
7
|
+
|
8
|
+
habari2md is distributed in the hope that it will be useful, but WITHOUT
|
9
|
+
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
10
|
+
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
11
|
+
for more details.
|
12
|
+
|
13
|
+
You should have received a copy of the GNU General Public License
|
14
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
data/README.md
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Habari2md
|
2
|
+
|
3
|
+
This is a dirty little Ruby program to export a [Habari][habari] blog to
|
4
|
+
markdown format. I used it to avoid installing PHP on a small VPS in order to
|
5
|
+
run a tiny blog of ~2000 posts.
|
6
|
+
|
7
|
+
The program makes a few assumptions about your setup, and this conditions what
|
8
|
+
you should expect to get from it.
|
9
|
+
|
10
|
+
* It will connect to a MariaDB/MySQL database,
|
11
|
+
* fetch all of its posts and:
|
12
|
+
* dump one file per published post in the `out` directory ;
|
13
|
+
* use a filename like `YYYY-MM-DD-post-slug.md` where `YYYY-MM-DD` are the
|
14
|
+
year, month, and month day when a particular post was published ;
|
15
|
+
* and format a post header with:
|
16
|
+
|
17
|
+
```
|
18
|
+
title: The original post title
|
19
|
+
author: The author's username
|
20
|
+
```
|
21
|
+
|
22
|
+
This process can be pretty specific, and if it does not fit your setup, feel
|
23
|
+
free to file an issue or, better, send a pull-request. ;)
|
24
|
+
|
25
|
+
# Dependencies
|
26
|
+
|
27
|
+
* Ruby >= 1.9
|
28
|
+
* Python >= 2.x
|
29
|
+
|
30
|
+
# Installation
|
31
|
+
|
32
|
+
`gem install habari2md`
|
33
|
+
|
34
|
+
# Usage
|
35
|
+
|
36
|
+
```
|
37
|
+
$ habari2md -h
|
38
|
+
Usage: habari2md [options]
|
39
|
+
-o, --output [DIR] Output directory
|
40
|
+
-s, --host [HOST] Database host
|
41
|
+
-d, --db [DB] Database name
|
42
|
+
-u, --user [USER] Database user
|
43
|
+
-p, --password [PASS] Database password
|
44
|
+
-h, --help Show this message
|
45
|
+
$ habari2md -o foobar -d my_blog_database -h localhost -u sql_user -p sql_password
|
46
|
+
I, [2014-01-08T23:31:20.771303 #74090] INFO -- : Exporting 12345 posts...
|
47
|
+
I, [2014-01-08T23:31:50.618731 #74090] INFO -- : 50% to go
|
48
|
+
I, [2014-01-08T23:32:20.081583 #74090] INFO -- : We're done.
|
49
|
+
D, [2014-01-08T23:32:20.083582 #74090] DEBUG -- : Terminating 6 actors...
|
50
|
+
W, [2014-01-08T23:32:20.084398 #74090] WARN -- : Terminating task: type=:finalizer, meta={:method_name=>:__shutdown__}, status=:callwait
|
51
|
+
➜
|
52
|
+
```
|
53
|
+
|
54
|
+
# License
|
55
|
+
|
56
|
+
GPL 3.0
|
57
|
+
|
58
|
+
Note: this distribution contains Aaron Swartz's [html2text][html2text] GPL
|
59
|
+
licensed program. As a matter of fact, we fork one process to convert each post
|
60
|
+
from HTML to [Markdown][markdown], yay!
|
61
|
+
|
62
|
+
[habari]: http://habariproject.org/
|
63
|
+
[html2text]: http://www.aaronsw.com/2002/html2text/
|
64
|
+
[markdown]: http://daringfireball.net/projects/markdown/
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/habari2md
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: parse CLI options and export a Habari blog to
# Markdown files via Habari2md::Exporter.

# BUG FIX: the original used `File.dirname(__FILE__) + '../lib'`, which is
# missing a path separator and resolves to ".../binlib" (never the real lib
# directory). Build the path properly instead.
$:.unshift(File.expand_path('../../lib', __FILE__))

require 'optparse'
require 'habari2md'

# Defaults; each switch below may override one entry.
options = {
  out: './out',
  host: 'localhost',
  user: 'root',
  password: 'root',
}

OptionParser.new do |opts|
  opts.banner = 'Usage: habari2md [options]'

  opts.on("-o", "--output [DIR]", "Output directory") do |dir|
    options[:out] = dir
  end

  # -s, not -h: -h is taken by --help below.
  opts.on("-s", "--host [HOST]", "Database host") do |host|
    options[:host] = host
  end

  opts.on("-d", "--db [DB]", "Database name") do |name|
    options[:db] = name
  end

  opts.on("-u", "--user [USER]", "Database user") do |user|
    options[:user] = user
  end

  opts.on("-p", "--password [PASS]", "Database password") do |pass|
    options[:password] = pass
  end

  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

Habari2md::Exporter.new(options).export_posts(options[:out])
|
data/certs/oz.pem
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
-----BEGIN CERTIFICATE-----
|
2
|
+
MIIDXDCCAkSgAwIBAgIBATANBgkqhkiG9w0BAQUFADA6MQswCQYDVQQDDAJvejEW
|
3
|
+
MBQGCgmSJomT8ixkARkWBmN5cHJpbzETMBEGCgmSJomT8ixkARkWA25ldDAeFw0x
|
4
|
+
NDAxMDkxNzE0NDVaFw0xNTAxMDkxNzE0NDVaMDoxCzAJBgNVBAMMAm96MRYwFAYK
|
5
|
+
CZImiZPyLGQBGRYGY3lwcmlvMRMwEQYKCZImiZPyLGQBGRYDbmV0MIIBIjANBgkq
|
6
|
+
hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy10tbHJlv/nomAnN23gT/9WF0Sfr/6/L
|
7
|
+
o8rkkmtFgI4gZKpY3RmmhJavlzw7Pq3hT50AN+gpacyS6GJ6NRhyR59T7EK0Mar0
|
8
|
+
7vCJhwW8EqjCjI2LVlv5NgJsQE9aFaNvNAl8cMuuWSw3UArB2ZRKsdE1J4KBTBpw
|
9
|
+
7agSPppFarNuHKyAXXsg2rfBmkDvfUKXE+8BccQ3ga1guhfFTAQgk8zLjE21opti
|
10
|
+
7qZbWToBSsV6dzBxpIWVkIcX2HnXsrpE1IJbXBzy60L5kHchzn+o2BB7wemBSMvk
|
11
|
+
yOaC2KRI5Xiy/THIZhheKGAHMvbu7xbz3Wt12J+H5iRBmE+VV/IRvwIDAQABo20w
|
12
|
+
azAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUP61Rx/1umQ17mKwZ
|
13
|
+
nGNam5fTDbMwGAYDVR0RBBEwD4ENb3pAY3lwcmlvLm5ldDAYBgNVHRIEETAPgQ1v
|
14
|
+
ekBjeXByaW8ubmV0MA0GCSqGSIb3DQEBBQUAA4IBAQC0CN++vyu1zcmOyckEHCa1
|
15
|
+
sk579L0i2DRO7bma9t+Z8Th6WVQqXGen0YYxznSzCPqQHN650IItnDUkujVHMI/g
|
16
|
+
ctUmyPXUryOA6EqFi0l+t7QSRysxy/79rZCIRufhFbsNhbwWMwUAEmHmJ2BHO7g4
|
17
|
+
EEI8FdoHY2xWEZ1SBu0gzn0Kmi5u1I6/i3NmvKchmIK3eQcPtu0xwSuFEw7SINcu
|
18
|
+
hfXfqFqS3mCcykIEz+V7ZRcIaiQse+263YcyYSYRws3EvEQH7C7XnUF7/Y6TpwnI
|
19
|
+
QDKpCyE1PBhKqihfimirfnkLKw1ZaUY9Nd8UpOopW8pA3eUdUqo0yJe6IQ7s8LyR
|
20
|
+
-----END CERTIFICATE-----
|
data/habari2md.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'habari2md/version'

Gem::Specification.new do |spec|
  spec.name          = "habari2md"
  spec.version       = Habari2md::VERSION
  spec.authors       = ["Arnaud Berthomier"]
  spec.email         = ["oz@cyprio.net"]
  spec.summary       = %q{Habari to markdown}
  spec.description   = %q{Dump a Habari blog posts to Markdown format}
  spec.homepage      = "https://github.com/oz/habari2md"
  # FIX: "GPL v3" is not a valid SPDX identifier and triggers a warning from
  # `gem build`; use the canonical license id for GPL 3.0.
  spec.license       = "GPL-3.0"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = []
  spec.require_paths = ["lib"]
  spec.cert_chain    = ["certs/oz.pem"]
  # Only look for the signing key when this spec is evaluated by `gem` itself
  # (e.g. `gem build`), so plain `bundle` runs don't need the key present.
  spec.signing_key   = File.expand_path("~/.ssh/gem-private_key.pem") if $0 =~ /gem\z/

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"

  spec.add_dependency "celluloid", "~> 0.15"
  spec.add_dependency "sequel", "~> 4.5"
  spec.add_dependency "mysql", "~> 2.9"
end
|
data/lib/habari2md.rb
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'fileutils'
|
3
|
+
require 'pathname'
|
4
|
+
|
5
|
+
require 'celluloid'
|
6
|
+
require 'sequel'
|
7
|
+
|
8
|
+
module Habari2md
|
9
|
+
# @class Habari2md::Text Text helpers
|
10
|
+
# @class Habari2md::Text Text helpers used when converting post bodies.
class Text
  # Wrap bare text in <p> tags: blank-line-separated chunks become
  # paragraphs, single newlines become <br /> tags.
  # (Shameless snatch from Rails' simple_format.)
  #
  # @param [String, nil] text raw post content (nil is treated as "")
  # @return [String] HTML-formatted text
  def self.simple_format(text)
    text = '' if text.nil?
    text = text.dup
    start_tag = '<p>'
    text = text.to_str
    text.gsub!(/\r\n?/, "\n")                    # \r\n and \r -> \n
    text.gsub!(/\n\n+/, "</p>\n\n#{start_tag}")  # 2+ newlines -> paragraph
    text.gsub!(/([^\n]\n)(?=[^\n])/, '\1<br />') # 1 newline -> br
    text.insert(0, start_tag)
    text << '</p>'
    return text
  end

  # Fork (!) html2text.py to convert from HTML to Markdown.
  #
  # @param [String] content HTML content
  # @return [String] Markdown content
  def self.html2text(content)
    IO.popen(html2text_script, "r+") do |io|
      io.write content
      io.close_write
      content = io.read
      io.close_read
    end
    content
  end

  protected

  # Absolute path to the vendored html2text.py script, memoized.
  # @return [String]
  def self.html2text_script
    @html2text ||= Pathname.new(File.dirname(__FILE__))
                           .join('vendor', 'html2text.py').to_s
  end
end
|
48
|
+
|
49
|
+
# @class Habari2md::Exporter
|
50
|
+
# @example Export stuff
|
51
|
+
# worker = Habari2md::Exporter.new(db: 'foo', user: 'root')
|
52
|
+
# worker.export_posts("./out")
|
53
|
+
# @class Habari2md::Exporter
# Connects to the blog database and drives a pool of PostExporter actors.
#
# @example Export stuff
#   worker = Habari2md::Exporter.new(db: 'foo', user: 'root')
#   worker.export_posts("./out")
class Exporter
  attr_reader :db
  include Celluloid
  include Celluloid::Logger

  def initialize(opts = {})
    @db = Sequel.connect(db_uri opts)
    @counter = 0
    @halfway = 0

    # Cache users, keyed by user id.
    # BUG FIX: the cache used to be keyed by the one-element array
    # [user[:id]], so #user(id) — which fetches by the plain id — never
    # found anything and every exported post lost its author.
    @users = @db[:users].all.inject({}) do |cache, user|
      cache.merge!(user[:id] => user)
    end
  end

  # All posts, oldest-modified first.
  def posts
    db[:posts].order(:modified)
  end

  # Look up a cached user row.
  # @param [Integer] id
  # @return [Hash] the user row, or {} when unknown
  def user(id)
    @users.fetch(id, {})
  end

  # Export every post as one Markdown file under +directory+.
  # Blocks until the whole pool has finished.
  def export_posts(directory)
    FileUtils.mkdir_p(directory) unless File.directory?(directory)

    @counter = posts.count
    @halfway = @counter / 2

    info "Exporting #{@counter} posts..."

    pool = Habari2md::PostExporter.pool(args: [directory, current_actor])
    posts.each { |post| pool.async.export(post) }

    # Wait for every PostExporter to report back via #post_exported.
    wait(:done)
    info "We're done."
  end

  # Called by PostExporter when an export operation has finished.
  def post_exported(post_id)
    @counter -= 1
    info "50% to go" if @counter == @halfway
    signal(:done) if @counter == 0
  end

  protected

  # Build a Sequel connection URI from the CLI options.
  # @param [Hash] opts (:user, :password, :host, :db)
  # @return [String]
  def db_uri(opts)
    "mysql://#{opts[:user]}:#{opts[:password]}@#{opts[:host]}/#{opts[:db]}"
  end
end
|
106
|
+
|
107
|
+
# @class Habari2md::PostExporter Export one post
|
108
|
+
# @class Habari2md::PostExporter Export one post to a Markdown file.
class PostExporter
  include Celluloid

  # Output directory
  attr_reader :dir

  # Manager actor (an Exporter); notified after each post
  attr_reader :manager

  def initialize(dest_dir, manager_actor)
    @dir = Pathname.new(dest_dir)
    @manager = manager_actor
  end

  # Placeholder title for untitled posts
  def untitled
    "Untitled"
  end

  # Signal the managing actor when a post has been exported
  def done(post = {})
    manager.post_exported(post[:id])
  end

  # Export one post to disk as "YYYY-MM-DD-slug.md": a small front-matter
  # header (title/author) followed by the Markdown body.
  # @param [Hash] post
  def export(post)
    # Ignore deleted posts and drafts.
    return done(post) unless published?(post)

    author = manager.user(post[:user_id])[:username]
    title = post[:title].gsub(/[\r\n]/, '')
    title = untitled if title == ""
    # BUG FIX: Time.strptime(..., "%s") needs `require 'time'`, which this
    # file never pulled in; Time.at on the epoch seconds is equivalent and
    # built in (pubdate is assumed to be epoch seconds — TODO confirm schema).
    date = Time.at(post[:pubdate].to_i).strftime("%Y-%m-%d")
    filename = dir.join("#{date}-#{post[:slug]}.md")
    # Don't overwrite existing exports unless FORCE is set in the env.
    # (File.exists? was removed in Ruby 3.2; use File.exist?.)
    return done(post) if File.exist?(filename) && ENV['FORCE'].nil?

    # Make sure content is at least formatted with <p> tags before
    # conversion.
    content = Habari2md::Text.simple_format(post[:content])
    File.open(filename, 'w+') do |fh|
      fh << "---\n"
      fh << "title: #{title}\n"
      fh << "author: #{author}\n" unless author.nil?
      fh << "---\n\n"
      fh << Habari2md::Text.html2text(content)
    end

    done(post)
  end

  # This actually depends on the values in the poststatus table.
  def published?(post)
    post[:status] == 2
  end
end
|
164
|
+
end
|
@@ -0,0 +1,914 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
3
|
+
__version__ = "3.200.3"
|
4
|
+
__author__ = "Aaron Swartz (me@aaronsw.com)"
|
5
|
+
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
|
6
|
+
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
7
|
+
|
8
|
+
# TODO:
|
9
|
+
# Support decoded entities with unifiable.
|
10
|
+
|
11
|
+
try:
|
12
|
+
True
|
13
|
+
except NameError:
|
14
|
+
setattr(__builtins__, 'True', 1)
|
15
|
+
setattr(__builtins__, 'False', 0)
|
16
|
+
|
17
|
+
def has_key(x, y):
    """Compatibility shim: use ``x.has_key`` when available (old Python 2
    mappings), otherwise fall back to the ``in`` operator."""
    return x.has_key(y) if hasattr(x, 'has_key') else y in x
|
20
|
+
|
21
|
+
try:
|
22
|
+
import htmlentitydefs
|
23
|
+
import urlparse
|
24
|
+
import HTMLParser
|
25
|
+
except ImportError: #Python3
|
26
|
+
import html.entities as htmlentitydefs
|
27
|
+
import urllib.parse as urlparse
|
28
|
+
import html.parser as HTMLParser
|
29
|
+
try: #Python3
|
30
|
+
import urllib.request as urllib
|
31
|
+
except:
|
32
|
+
import urllib
|
33
|
+
import optparse, re, sys, codecs, types
|
34
|
+
|
35
|
+
try: from textwrap import wrap
|
36
|
+
except: pass
|
37
|
+
|
38
|
+
# Use Unicode characters instead of their ascii psuedo-replacements
|
39
|
+
UNICODE_SNOB = 0
|
40
|
+
|
41
|
+
# Escape all special characters. Output is less readable, but avoids corner case formatting issues.
|
42
|
+
ESCAPE_SNOB = 0
|
43
|
+
|
44
|
+
# Put the links after each paragraph instead of at the end.
|
45
|
+
LINKS_EACH_PARAGRAPH = 0
|
46
|
+
|
47
|
+
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
|
48
|
+
BODY_WIDTH = 78
|
49
|
+
|
50
|
+
# Don't show internal links (href="#local-anchor") -- corresponding link targets
|
51
|
+
# won't be visible in the plain text file anyway.
|
52
|
+
SKIP_INTERNAL_LINKS = True
|
53
|
+
|
54
|
+
# Use inline, rather than reference, formatting for images and links
|
55
|
+
INLINE_LINKS = True
|
56
|
+
|
57
|
+
# Number of pixels Google indents nested lists
|
58
|
+
GOOGLE_LIST_INDENT = 36
|
59
|
+
|
60
|
+
IGNORE_ANCHORS = False
|
61
|
+
IGNORE_IMAGES = False
|
62
|
+
IGNORE_EMPHASIS = False
|
63
|
+
|
64
|
+
### Entity Nonsense ###
|
65
|
+
|
66
|
+
def name2cp(k):
    """Map an HTML entity name (e.g. 'amp') to its Unicode code point."""
    if k == 'apos':
        return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    # Old-Python fallback: entitydefs values are latin-1 bytes or "&#NNN;".
    value = htmlentitydefs.entitydefs[k]
    if value.startswith("&#") and value.endswith(";"):
        return int(value[2:-1])  # not in latin-1
    return ord(codecs.latin_1_decode(value)[0])
|
74
|
+
|
75
|
+
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
|
76
|
+
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
|
77
|
+
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
|
78
|
+
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
|
79
|
+
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
|
80
|
+
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
|
81
|
+
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
|
82
|
+
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
|
83
|
+
'lrm':'', 'rlm':''}
|
84
|
+
|
85
|
+
unifiable_n = {}
|
86
|
+
|
87
|
+
for k in unifiable.keys():
|
88
|
+
unifiable_n[name2cp(k)] = unifiable[k]
|
89
|
+
|
90
|
+
### End Entity Nonsense ###
|
91
|
+
|
92
|
+
def onlywhite(line):
    """Return a truthy value if `line` consists only of blank characters.

    Quirky contract preserved from the original: returns False at the first
    non-blank character, and returns `line` itself (truthy unless empty)
    when every character is blank.
    """
    for c in line:
        # BUG FIX: the original compared characters with `is not`, relying on
        # CPython's interning of one-character strings (and the tab in the
        # second comparison had degraded into a duplicate space). Use real
        # equality against space and tab.
        if c != ' ' and c != '\t':
            return c == ' '
    return line
|
98
|
+
|
99
|
+
def hn(tag):
    """Return the header level for 'h1'..'h9' tags.

    Returns 0 when the second character is not a digit, and None (implicit)
    for anything that is not a two-character h-tag or is out of range.
    """
    if tag[0] == 'h' and len(tag) == 2:
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
|
105
|
+
|
106
|
+
def dumb_property_dict(style):
    """Parse an inline CSS declaration string into a {property: value} hash."""
    result = {}
    for declaration in style.split(';'):
        if ':' not in declaration:
            continue
        prop, value = declaration.split(':', 1)
        result[prop.strip()] = value.strip()
    return result
|
109
|
+
|
110
|
+
def dumb_css_parser(data):
    """Parse a stylesheet into {selector: {property: value}} (best effort)."""
    # Remove "@import ...;" statements before parsing.
    data += ';'
    while True:
        start = data.find('@import')
        if start == -1:
            break
        data = data[0:start] + data[data.find(';', start) + 1:]

    # Parse the css. Reverted from dictionary comprehension in order to
    # support older pythons.
    pairs = [chunk.split('{') for chunk in data.split('}') if '{' in chunk.strip()]
    try:
        return dict([(sel.strip(), dumb_property_dict(body)) for sel, body in pairs])
    except ValueError:
        return {}  # not that important
|
127
|
+
|
128
|
+
def element_style(attrs, style_def, parent_style):
    """Compute the effective style hash for an element.

    Starts from a copy of the parent's style, layers on each matching CSS
    class from style_def, then any inline ``style`` attribute.
    """
    style = parent_style.copy()
    if 'class' in attrs:
        for css_class in attrs['class'].split():
            style.update(style_def['.' + css_class])
    if 'style' in attrs:
        style.update(dumb_property_dict(attrs['style']))
    return style
|
139
|
+
|
140
|
+
def google_list_style(style):
    """Decide whether a Google Docs list is unordered ('ul') or ordered ('ol')."""
    if style.get('list-style-type') in ('disc', 'circle', 'square', 'none'):
        return 'ul'
    return 'ol'
|
147
|
+
|
148
|
+
def google_has_height(style):
    """True when the element's style explicitly defines a 'height'."""
    return 'height' in style
|
153
|
+
|
154
|
+
def google_text_emphasis(style):
    """Collect the emphasis modifiers present on the element, in a fixed
    order: text-decoration, font-style, font-weight."""
    keys = ('text-decoration', 'font-style', 'font-weight')
    return [style[key] for key in keys if key in style]
|
164
|
+
|
165
|
+
def google_fixed_width_font(style):
    """True when the element's css declares a known fixed-width font-family."""
    return style.get('font-family', '') in ('Courier New', 'Consolas')
|
173
|
+
|
174
|
+
def list_numbering_start(attrs):
    """Extract the zero-based starting number from a list's 'start' attribute."""
    if 'start' not in attrs:
        return 0
    return int(attrs['start']) - 1
|
180
|
+
|
181
|
+
class HTML2Text(HTMLParser.HTMLParser):
|
182
|
+
def __init__(self, out=None, baseurl=''):
|
183
|
+
HTMLParser.HTMLParser.__init__(self)
|
184
|
+
|
185
|
+
# Config options
|
186
|
+
self.unicode_snob = UNICODE_SNOB
|
187
|
+
self.escape_snob = ESCAPE_SNOB
|
188
|
+
self.links_each_paragraph = LINKS_EACH_PARAGRAPH
|
189
|
+
self.body_width = BODY_WIDTH
|
190
|
+
self.skip_internal_links = SKIP_INTERNAL_LINKS
|
191
|
+
self.inline_links = INLINE_LINKS
|
192
|
+
self.google_list_indent = GOOGLE_LIST_INDENT
|
193
|
+
self.ignore_links = IGNORE_ANCHORS
|
194
|
+
self.ignore_images = IGNORE_IMAGES
|
195
|
+
self.ignore_emphasis = IGNORE_EMPHASIS
|
196
|
+
self.google_doc = False
|
197
|
+
self.ul_item_mark = '*'
|
198
|
+
self.emphasis_mark = '_'
|
199
|
+
self.strong_mark = '**'
|
200
|
+
|
201
|
+
if out is None:
|
202
|
+
self.out = self.outtextf
|
203
|
+
else:
|
204
|
+
self.out = out
|
205
|
+
|
206
|
+
self.outtextlist = [] # empty list to store output characters before they are "joined"
|
207
|
+
|
208
|
+
try:
|
209
|
+
self.outtext = unicode()
|
210
|
+
except NameError: # Python3
|
211
|
+
self.outtext = str()
|
212
|
+
|
213
|
+
self.quiet = 0
|
214
|
+
self.p_p = 0 # number of newline character to print before next output
|
215
|
+
self.outcount = 0
|
216
|
+
self.start = 1
|
217
|
+
self.space = 0
|
218
|
+
self.a = []
|
219
|
+
self.astack = []
|
220
|
+
self.maybe_automatic_link = None
|
221
|
+
self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
|
222
|
+
self.acount = 0
|
223
|
+
self.list = []
|
224
|
+
self.blockquote = 0
|
225
|
+
self.pre = 0
|
226
|
+
self.startpre = 0
|
227
|
+
self.code = False
|
228
|
+
self.br_toggle = ''
|
229
|
+
self.lastWasNL = 0
|
230
|
+
self.lastWasList = False
|
231
|
+
self.style = 0
|
232
|
+
self.style_def = {}
|
233
|
+
self.tag_stack = []
|
234
|
+
self.emphasis = 0
|
235
|
+
self.drop_white_space = 0
|
236
|
+
self.inheader = False
|
237
|
+
self.abbr_title = None # current abbreviation definition
|
238
|
+
self.abbr_data = None # last inner HTML (for abbr being defined)
|
239
|
+
self.abbr_list = {} # stack of abbreviations to write later
|
240
|
+
self.baseurl = baseurl
|
241
|
+
|
242
|
+
try: del unifiable_n[name2cp('nbsp')]
|
243
|
+
except KeyError: pass
|
244
|
+
unifiable['nbsp'] = ' _place_holder;'
|
245
|
+
|
246
|
+
|
247
|
+
def feed(self, data):
|
248
|
+
data = data.replace("</' + 'script>", "</ignore>")
|
249
|
+
HTMLParser.HTMLParser.feed(self, data)
|
250
|
+
|
251
|
+
def handle(self, data):
|
252
|
+
self.feed(data)
|
253
|
+
self.feed("")
|
254
|
+
return self.optwrap(self.close())
|
255
|
+
|
256
|
+
def outtextf(self, s):
|
257
|
+
self.outtextlist.append(s)
|
258
|
+
if s: self.lastWasNL = s[-1] == '\n'
|
259
|
+
|
260
|
+
def close(self):
|
261
|
+
HTMLParser.HTMLParser.close(self)
|
262
|
+
|
263
|
+
self.pbr()
|
264
|
+
self.o('', 0, 'end')
|
265
|
+
|
266
|
+
self.outtext = self.outtext.join(self.outtextlist)
|
267
|
+
if self.unicode_snob:
|
268
|
+
nbsp = unichr(name2cp('nbsp'))
|
269
|
+
else:
|
270
|
+
nbsp = u' '
|
271
|
+
self.outtext = self.outtext.replace(u' _place_holder;', nbsp)
|
272
|
+
|
273
|
+
return self.outtext
|
274
|
+
|
275
|
+
def handle_charref(self, c):
|
276
|
+
self.o(self.charref(c), 1)
|
277
|
+
|
278
|
+
def handle_entityref(self, c):
|
279
|
+
self.o(self.entityref(c), 1)
|
280
|
+
|
281
|
+
def handle_starttag(self, tag, attrs):
|
282
|
+
self.handle_tag(tag, attrs, 1)
|
283
|
+
|
284
|
+
def handle_endtag(self, tag):
|
285
|
+
self.handle_tag(tag, None, 0)
|
286
|
+
|
287
|
+
def previousIndex(self, attrs):
|
288
|
+
""" returns the index of certain set of attributes (of a link) in the
|
289
|
+
self.a list
|
290
|
+
|
291
|
+
If the set of attributes is not found, returns None
|
292
|
+
"""
|
293
|
+
if not has_key(attrs, 'href'): return None
|
294
|
+
|
295
|
+
i = -1
|
296
|
+
for a in self.a:
|
297
|
+
i += 1
|
298
|
+
match = 0
|
299
|
+
|
300
|
+
if has_key(a, 'href') and a['href'] == attrs['href']:
|
301
|
+
if has_key(a, 'title') or has_key(attrs, 'title'):
|
302
|
+
if (has_key(a, 'title') and has_key(attrs, 'title') and
|
303
|
+
a['title'] == attrs['title']):
|
304
|
+
match = True
|
305
|
+
else:
|
306
|
+
match = True
|
307
|
+
|
308
|
+
if match: return i
|
309
|
+
|
310
|
+
def drop_last(self, nLetters):
|
311
|
+
if not self.quiet:
|
312
|
+
self.outtext = self.outtext[:-nLetters]
|
313
|
+
|
314
|
+
def handle_emphasis(self, start, tag_style, parent_style):
|
315
|
+
"""handles various text emphases"""
|
316
|
+
tag_emphasis = google_text_emphasis(tag_style)
|
317
|
+
parent_emphasis = google_text_emphasis(parent_style)
|
318
|
+
|
319
|
+
# handle Google's text emphasis
|
320
|
+
strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
|
321
|
+
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
|
322
|
+
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
|
323
|
+
fixed = google_fixed_width_font(tag_style) and not \
|
324
|
+
google_fixed_width_font(parent_style) and not self.pre
|
325
|
+
|
326
|
+
if start:
|
327
|
+
# crossed-out text must be handled before other attributes
|
328
|
+
# in order not to output qualifiers unnecessarily
|
329
|
+
if bold or italic or fixed:
|
330
|
+
self.emphasis += 1
|
331
|
+
if strikethrough:
|
332
|
+
self.quiet += 1
|
333
|
+
if italic:
|
334
|
+
self.o(self.emphasis_mark)
|
335
|
+
self.drop_white_space += 1
|
336
|
+
if bold:
|
337
|
+
self.o(self.strong_mark)
|
338
|
+
self.drop_white_space += 1
|
339
|
+
if fixed:
|
340
|
+
self.o('`')
|
341
|
+
self.drop_white_space += 1
|
342
|
+
self.code = True
|
343
|
+
else:
|
344
|
+
if bold or italic or fixed:
|
345
|
+
# there must not be whitespace before closing emphasis mark
|
346
|
+
self.emphasis -= 1
|
347
|
+
self.space = 0
|
348
|
+
self.outtext = self.outtext.rstrip()
|
349
|
+
if fixed:
|
350
|
+
if self.drop_white_space:
|
351
|
+
# empty emphasis, drop it
|
352
|
+
self.drop_last(1)
|
353
|
+
self.drop_white_space -= 1
|
354
|
+
else:
|
355
|
+
self.o('`')
|
356
|
+
self.code = False
|
357
|
+
if bold:
|
358
|
+
if self.drop_white_space:
|
359
|
+
# empty emphasis, drop it
|
360
|
+
self.drop_last(2)
|
361
|
+
self.drop_white_space -= 1
|
362
|
+
else:
|
363
|
+
self.o(self.strong_mark)
|
364
|
+
if italic:
|
365
|
+
if self.drop_white_space:
|
366
|
+
# empty emphasis, drop it
|
367
|
+
self.drop_last(1)
|
368
|
+
self.drop_white_space -= 1
|
369
|
+
else:
|
370
|
+
self.o(self.emphasis_mark)
|
371
|
+
# space is only allowed after *all* emphasis marks
|
372
|
+
if (bold or italic) and not self.emphasis:
|
373
|
+
self.o(" ")
|
374
|
+
if strikethrough:
|
375
|
+
self.quiet -= 1
|
376
|
+
|
377
|
+
def handle_tag(self, tag, attrs, start):
|
378
|
+
#attrs = fixattrs(attrs)
|
379
|
+
if attrs is None:
|
380
|
+
attrs = {}
|
381
|
+
else:
|
382
|
+
attrs = dict(attrs)
|
383
|
+
|
384
|
+
if self.google_doc:
|
385
|
+
# the attrs parameter is empty for a closing tag. in addition, we
|
386
|
+
# need the attributes of the parent nodes in order to get a
|
387
|
+
# complete style description for the current element. we assume
|
388
|
+
# that google docs export well formed html.
|
389
|
+
parent_style = {}
|
390
|
+
if start:
|
391
|
+
if self.tag_stack:
|
392
|
+
parent_style = self.tag_stack[-1][2]
|
393
|
+
tag_style = element_style(attrs, self.style_def, parent_style)
|
394
|
+
self.tag_stack.append((tag, attrs, tag_style))
|
395
|
+
else:
|
396
|
+
dummy, attrs, tag_style = self.tag_stack.pop()
|
397
|
+
if self.tag_stack:
|
398
|
+
parent_style = self.tag_stack[-1][2]
|
399
|
+
|
400
|
+
if hn(tag):
|
401
|
+
self.p()
|
402
|
+
if start:
|
403
|
+
self.inheader = True
|
404
|
+
self.o(hn(tag)*"#" + ' ')
|
405
|
+
else:
|
406
|
+
self.inheader = False
|
407
|
+
return # prevent redundant emphasis marks on headers
|
408
|
+
|
409
|
+
if tag in ['p', 'div']:
|
410
|
+
if self.google_doc:
|
411
|
+
if start and google_has_height(tag_style):
|
412
|
+
self.p()
|
413
|
+
else:
|
414
|
+
self.soft_br()
|
415
|
+
else:
|
416
|
+
self.p()
|
417
|
+
|
418
|
+
if tag == "br" and start: self.o(" \n")
|
419
|
+
|
420
|
+
if tag == "hr" and start:
|
421
|
+
self.p()
|
422
|
+
self.o("* * *")
|
423
|
+
self.p()
|
424
|
+
|
425
|
+
if tag in ["head", "style", 'script']:
|
426
|
+
if start: self.quiet += 1
|
427
|
+
else: self.quiet -= 1
|
428
|
+
|
429
|
+
if tag == "style":
|
430
|
+
if start: self.style += 1
|
431
|
+
else: self.style -= 1
|
432
|
+
|
433
|
+
if tag in ["body"]:
|
434
|
+
self.quiet = 0 # sites like 9rules.com never close <head>
|
435
|
+
|
436
|
+
if tag == "blockquote":
|
437
|
+
if start:
|
438
|
+
self.p(); self.o('> ', 0, 1); self.start = 1
|
439
|
+
self.blockquote += 1
|
440
|
+
else:
|
441
|
+
self.blockquote -= 1
|
442
|
+
self.p()
|
443
|
+
|
444
|
+
if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
|
445
|
+
if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
|
446
|
+
if tag in ['del', 'strike', 's']:
|
447
|
+
if start:
|
448
|
+
self.o("<"+tag+">")
|
449
|
+
else:
|
450
|
+
self.o("</"+tag+">")
|
451
|
+
|
452
|
+
if self.google_doc:
|
453
|
+
if not self.inheader:
|
454
|
+
# handle some font attributes, but leave headers clean
|
455
|
+
self.handle_emphasis(start, tag_style, parent_style)
|
456
|
+
|
457
|
+
if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
|
458
|
+
if tag == "abbr":
|
459
|
+
if start:
|
460
|
+
self.abbr_title = None
|
461
|
+
self.abbr_data = ''
|
462
|
+
if has_key(attrs, 'title'):
|
463
|
+
self.abbr_title = attrs['title']
|
464
|
+
else:
|
465
|
+
if self.abbr_title != None:
|
466
|
+
self.abbr_list[self.abbr_data] = self.abbr_title
|
467
|
+
self.abbr_title = None
|
468
|
+
self.abbr_data = ''
|
469
|
+
|
470
|
+
if tag == "a" and not self.ignore_links:
|
471
|
+
if start:
|
472
|
+
if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
|
473
|
+
self.astack.append(attrs)
|
474
|
+
self.maybe_automatic_link = attrs['href']
|
475
|
+
else:
|
476
|
+
self.astack.append(None)
|
477
|
+
else:
|
478
|
+
if self.astack:
|
479
|
+
a = self.astack.pop()
|
480
|
+
if self.maybe_automatic_link:
|
481
|
+
self.maybe_automatic_link = None
|
482
|
+
elif a:
|
483
|
+
if self.inline_links:
|
484
|
+
self.o("](" + escape_md(a['href']) + ")")
|
485
|
+
else:
|
486
|
+
i = self.previousIndex(a)
|
487
|
+
if i is not None:
|
488
|
+
a = self.a[i]
|
489
|
+
else:
|
490
|
+
self.acount += 1
|
491
|
+
a['count'] = self.acount
|
492
|
+
a['outcount'] = self.outcount
|
493
|
+
self.a.append(a)
|
494
|
+
self.o("][" + str(a['count']) + "]")
|
495
|
+
|
496
|
+
if tag == "img" and start and not self.ignore_images:
|
497
|
+
if has_key(attrs, 'src'):
|
498
|
+
attrs['href'] = attrs['src']
|
499
|
+
alt = attrs.get('alt', '')
|
500
|
+
self.o("![" + escape_md(alt) + "]")
|
501
|
+
|
502
|
+
if self.inline_links:
|
503
|
+
self.o("(" + escape_md(attrs['href']) + ")")
|
504
|
+
else:
|
505
|
+
i = self.previousIndex(attrs)
|
506
|
+
if i is not None:
|
507
|
+
attrs = self.a[i]
|
508
|
+
else:
|
509
|
+
self.acount += 1
|
510
|
+
attrs['count'] = self.acount
|
511
|
+
attrs['outcount'] = self.outcount
|
512
|
+
self.a.append(attrs)
|
513
|
+
self.o("[" + str(attrs['count']) + "]")
|
514
|
+
|
515
|
+
if tag == 'dl' and start: self.p()
|
516
|
+
if tag == 'dt' and not start: self.pbr()
|
517
|
+
if tag == 'dd' and start: self.o(' ')
|
518
|
+
if tag == 'dd' and not start: self.pbr()
|
519
|
+
|
520
|
+
if tag in ["ol", "ul"]:
|
521
|
+
# Google Docs create sub lists as top level lists
|
522
|
+
if (not self.list) and (not self.lastWasList):
|
523
|
+
self.p()
|
524
|
+
if start:
|
525
|
+
if self.google_doc:
|
526
|
+
list_style = google_list_style(tag_style)
|
527
|
+
else:
|
528
|
+
list_style = tag
|
529
|
+
numbering_start = list_numbering_start(attrs)
|
530
|
+
self.list.append({'name':list_style, 'num':numbering_start})
|
531
|
+
else:
|
532
|
+
if self.list: self.list.pop()
|
533
|
+
self.lastWasList = True
|
534
|
+
else:
|
535
|
+
self.lastWasList = False
|
536
|
+
|
537
|
+
if tag == 'li':
|
538
|
+
self.pbr()
|
539
|
+
if start:
|
540
|
+
if self.list: li = self.list[-1]
|
541
|
+
else: li = {'name':'ul', 'num':0}
|
542
|
+
if self.google_doc:
|
543
|
+
nest_count = self.google_nest_count(tag_style)
|
544
|
+
else:
|
545
|
+
nest_count = len(self.list)
|
546
|
+
self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
|
547
|
+
if li['name'] == "ul": self.o(self.ul_item_mark + " ")
|
548
|
+
elif li['name'] == "ol":
|
549
|
+
li['num'] += 1
|
550
|
+
self.o(str(li['num'])+". ")
|
551
|
+
self.start = 1
|
552
|
+
|
553
|
+
if tag in ["table", "tr"] and start: self.p()
|
554
|
+
if tag == 'td': self.pbr()
|
555
|
+
|
556
|
+
if tag == "pre":
|
557
|
+
if start:
|
558
|
+
self.startpre = 1
|
559
|
+
self.pre = 1
|
560
|
+
else:
|
561
|
+
self.pre = 0
|
562
|
+
self.p()
|
563
|
+
|
564
|
+
def pbr(self):
    """Request a single pending line break.

    Never downgrades an already-queued paragraph break (p_p == 2);
    only upgrades from "nothing pending" (0) to "one newline" (1).
    """
    self.p_p = max(self.p_p, 1)
|
567
|
+
|
568
|
+
def p(self):
    """Queue a full paragraph break: two pending newlines, flushed by o()."""
    self.p_p = 2
|
570
|
+
|
571
|
+
def soft_br(self):
    """Queue a markdown soft line break (<br>).

    A markdown line break is a newline whose preceding line ends with TWO
    trailing spaces; br_toggle carries those spaces into the next pending
    newline flushed by o().  The vendored copy had a single space here
    (almost certainly whitespace-collapsed), which renders as no break.
    """
    self.pbr()
    self.br_toggle = '  '
|
574
|
+
|
575
|
+
def o(self, data, puredata=0, force=0):
    """Emit output text, first flushing any pending state: paragraph breaks,
    blockquote prefixes, <pre> indentation, reference-link footnotes and
    <abbr> definitions.  The actual write goes through self.out().

    puredata=1 collapses whitespace runs in data; force may be truthy
    (emit even when data is empty) or the string 'end' (final flush at
    end of document).
    """
    # While inside an <abbr>, accumulate the abbreviation's visible text.
    if self.abbr_data is not None:
        self.abbr_data += data

    if not self.quiet:
        if self.google_doc:
            # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
            lstripped_data = data.lstrip()
            if self.drop_white_space and not (self.pre or self.code):
                data = lstripped_data
            if lstripped_data != '':
                self.drop_white_space = 0

        if puredata and not self.pre:
            # Collapse whitespace runs; a single leading space is remembered
            # in self.space so it can be emitted later (or dropped after a
            # newline).
            data = re.sub('\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force: return

        if self.startpre:
            #self.out(" :") #TODO: not output when already one there
            if not data.startswith("\n"): # <pre>stuff...
                data = "\n" + data

        # Per-line prefix: one ">" per blockquote nesting level.
        bq = (">" * self.blockquote)
        if not (force and data and data[0] == ">") and self.blockquote: bq += " "

        if self.pre:
            # NOTE(review): upstream html2text appends FOUR spaces per level
            # here ("    "); the single spaces in this copy look
            # whitespace-collapsed — verify against the vendored original.
            if not self.list:
                bq += " "
            #else: list content is already partially indented
            for i in xrange(len(self.list)):
                bq += " "
            data = data.replace("\n", "\n"+bq)

        if self.startpre:
            self.startpre = 0
            if self.list:
                data = data.lstrip("\n") # use existing initial indentation

        if self.start:
            # Suppress leading space/breaks at the very start of the document.
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        # Flush pending breaks: p_p newlines, each prefixed with the
        # blockquote string; br_toggle carries soft-break trailing spaces.
        if self.p_p:
            self.out((self.br_toggle+'\n'+bq)*self.p_p)
            self.space = 0
            self.br_toggle = ''

        if self.space:
            if not self.lastWasNL: self.out(' ')
            self.space = 0

        # Emit accumulated reference-style link definitions at a paragraph
        # boundary (when links_each_paragraph is set) or at end of document.
        # NOTE(review): upstream prefixes these with "   [" (three spaces)
        # and "  *[" (two spaces) below — likely collapsed here; verify.
        if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
            if force == "end": self.out("\n")

            newa = []
            for link in self.a:
                if self.outcount > link['outcount']:
                    self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                    if has_key(link, 'title'): self.out(" ("+link['title']+")")
                    self.out("\n")
                else:
                    # Link's paragraph not yet emitted: keep it queued.
                    newa.append(link)

            if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

            self.a = newa

        # At end of document, emit collected <abbr> definitions.
        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out(" *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.outcount += 1
|
659
|
+
|
660
|
+
def handle_data(self, data):
    """Receive a run of character data and emit it as (escaped) markdown."""
    # A literal "\/script>" in the data ends a quiet (script) section.
    if r'\/script>' in data:
        self.quiet -= 1

    # Inside <style>, the data is CSS: fold it into the style table
    # instead of printing it.
    if self.style:
        self.style_def.update(dumb_css_parser(data))

    # A pending <a href=...> whose visible text equals its absolute target
    # is rendered as an automatic link <url>.  The marker is deliberately
    # left set in that case so the closing-tag handler knows.
    if self.maybe_automatic_link is not None:
        href = self.maybe_automatic_link
        if href == data and self.absolute_url_matcher.match(href):
            self.o("<" + data + ">")
            return
        self.o("[")
        self.maybe_automatic_link = None

    # Escape markdown-sensitive characters except inside code/pre blocks.
    if not (self.code or self.pre):
        data = escape_md_section(data, snob=self.escape_snob)
    self.o(data, 1)
|
678
|
+
|
679
|
+
def unknown_decl(self, data):
    """Ignore SGML declarations (e.g. CDATA) — they produce no markdown."""
    pass
|
680
|
+
|
681
|
+
def charref(self, name):
    """Resolve a numeric character reference (&#nnn; or &#xhh;) to text."""
    # Hexadecimal references carry a leading 'x' or 'X'.
    if name[0] in ('x', 'X'):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)

    # Prefer the ASCII-friendly replacement table unless unicode output
    # was requested.
    if not self.unicode_snob and codepoint in unifiable_n:
        return unifiable_n[codepoint]
    try:  # Python 2
        return unichr(codepoint)
    except NameError:  # Python 3
        return chr(codepoint)
|
694
|
+
|
695
|
+
def entityref(self, c):
    """Resolve a named character reference (&name;) to text.

    Unknown entity names are passed through verbatim as "&name;".
    """
    # Prefer the ASCII-friendly replacement table unless unicode output
    # was requested.
    if not self.unicode_snob and c in unifiable:
        return unifiable[c]
    try:
        # Look the codepoint up once (the original called name2cp twice:
        # once to probe for KeyError, again to use the value).
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c + ';'
    try:  # Python 2
        return unichr(codepoint)
    except NameError:  # Python 3
        return chr(codepoint)
|
706
|
+
|
707
|
+
def replaceEntities(self, s):
    """re.sub callback: decode one matched entity (numeric or named).

    s is a match object whose group(1) is the entity body without the
    surrounding '&' and ';'.
    """
    entity = s.group(1)
    if entity.startswith("#"):
        return self.charref(entity[1:])
    return self.entityref(entity)
|
712
|
+
|
713
|
+
# Matches one HTML character reference: numeric (&#160; / &#xA0;) or a
# named entity of up to 8 word characters (&nbsp;).
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(self, s):
    """Replace every HTML entity in s with the character it denotes."""
    return self.r_unescape.sub(self.replaceEntities, s)
|
716
|
+
|
717
|
+
def google_nest_count(self, style):
    """Calculate the nesting count of google doc lists.

    Google Docs encodes list depth as a 'margin-left' of
    depth * google_list_indent pixels; recover the depth from the
    parsed style dict (values like '72px').
    """
    nest_count = 0
    if 'margin-left' in style:
        # Strip the trailing 'px'.  Use floor division so the result is an
        # int on Python 3 as well — the caller uses it as a string repeat
        # count ("  " * nest_count), which rejects floats.
        nest_count = int(style['margin-left'][:-2]) // self.google_list_indent
    return nest_count
|
723
|
+
|
724
|
+
|
725
|
+
def optwrap(self, text):
    """Wrap all paragraphs in the provided text.

    Returns text unchanged when body_width is 0 (wrapping disabled).
    """
    if not self.body_width:
        return text

    assert wrap, "Requires Python 2.3."
    result = ''
    newlines = 0  # how many consecutive newlines the result currently ends with
    for para in text.split("\n"):
        if len(para) > 0:
            if not skipwrap(para):
                # Ordinary paragraph: re-wrap to body_width columns.
                result += "\n".join(wrap(para, self.body_width))
                if para.endswith(' '):
                    # Preserve a markdown soft line break (trailing spaces).
                    # NOTE(review): upstream html2text checks for TWO trailing
                    # spaces ('  ') and appends "  \n" here; the single
                    # spaces in this copy look whitespace-collapsed — verify.
                    result += " \n"
                    newlines = 1
                else:
                    result += "\n\n"
                    newlines = 2
            else:
                # Unwrappable line (code block, list item, ...): emit as-is,
                # dropping lines that are only whitespace.
                if not onlywhite(para):
                    result += para + "\n"
                    newlines = 1
        else:
            # Collapse runs of blank lines to at most one blank line.
            if newlines < 2:
                result += "\n"
                newlines += 1
    return result
|
752
|
+
|
753
|
+
# Paragraph-shape matchers used by skipwrap() to recognise list items.
ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped by escape_md() inside other markdown constructs, and
# (with --escape-all) by escape_md_section() across whole sections.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# "<number>." at the start of a line would otherwise begin an ordered list.
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# "+" at the start of a line would otherwise begin an unordered list item.
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
# "-" at the start of a line could begin a list item, a setext header
# underline, or a horizontal rule.
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
              # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
# Characters that, when already preceded by a backslash in the input, force
# that backslash itself to be escaped (see md_backslash_matcher).
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\) # match one slash
    (?=[%s]) # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)
|
782
|
+
|
783
|
+
def skipwrap(para):
    """Return True when a paragraph must NOT be re-wrapped by optwrap().

    Code blocks, list items and similar line-oriented markdown would be
    corrupted by re-wrapping.
    """
    # An empty paragraph has nothing to wrap; bail out early (the original
    # raised IndexError on para[0] for empty input).
    if not para:
        return False
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    # NOTE(review): the vendored copy compared against a single space here —
    # restored to the evident four-space code-block test.
    if para[0:4] == '    ' or para[0:1] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceeded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
|
802
|
+
|
803
|
+
def wrapwrite(text):
    """Write text to stdout as UTF-8 bytes (works on Python 2 and 3)."""
    encoded = text.encode('utf-8')
    try: #Python3
        # Python 3: stdout is a text stream, so write through its byte buffer.
        sys.stdout.buffer.write(encoded)
    except AttributeError:
        # Python 2: stdout accepts the byte string directly.
        sys.stdout.write(encoded)
|
809
|
+
|
810
|
+
def html2text(html, baseurl=''):
    """Convenience wrapper: convert an HTML string to markdown text."""
    converter = HTML2Text(baseurl=baseurl)
    return converter.handle(html)
|
813
|
+
|
814
|
+
def unescape(s, unicode_snob=False):
    """Module-level helper: expand HTML entities in s via a throwaway converter."""
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
|
818
|
+
|
819
|
+
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs.

    Only backslashes, brackets and parentheses — the characters that would
    break link syntax — are escaped here.
    """
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped
|
822
|
+
|
823
|
+
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    snob=True additionally escapes every special markdown character
    (--escape-all); output is less readable but unambiguous.
    """
    # Escape pre-existing backslashes first, so escapes added below are not
    # themselves re-escaped.
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # "1." / "+" / "-" at the start of a line would otherwise begin a list
    # item, header underline, or horizontal rule.
    text = md_dot_matcher.sub(r"\1\\\2", text)
    text = md_plus_matcher.sub(r"\1\\\2", text)
    text = md_dash_matcher.sub(r"\1\\\2", text)
    return text
|
832
|
+
|
833
|
+
|
834
|
+
def main():
    """Command-line entry point: read HTML from a file, URL, or stdin and
    write the markdown conversion to stdout."""
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters.  Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            # NOTE(review): urllib.urlopen exists only on Python 2; under
            # Python 3 this needs urllib.request.urlopen — confirm the
            # intended runtime for this vendored copy.
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): encoding is initialised to "utf-8" above and is
            # never reset to None, so this branch (and the matching one in
            # the file case below) appears unreachable.  Upstream html2text
            # sets encoding = None when a filename/url argument is given;
            # verify whether that line was dropped here.
            if encoding is None:
                try:
                    # feedparser can sniff the charset from HTTP headers + body.
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    # chardet gives a best-effort encoding guess for local files.
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
|
911
|
+
|
912
|
+
|
913
|
+
# Allow the module to be run directly as a command-line filter.
if __name__ == "__main__":
    main()
|