habari2md 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data.tar.gz.sig +0 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +14 -0
- data/README.md +64 -0
- data/Rakefile +1 -0
- data/bin/habari2md +44 -0
- data/certs/oz.pem +20 -0
- data/habari2md.gemspec +29 -0
- data/lib/habari2md.rb +164 -0
- data/lib/habari2md/version.rb +3 -0
- data/lib/vendor/html2text.py +914 -0
- metadata +148 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c2fb0d61a3de12d336d24ab9b5d3e47ec7b9a098
|
4
|
+
data.tar.gz: 063bedd9049364662e8e200efbf4a2a1c97a9f46
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 17550bf8808495e447d04fe9f5475e170a28f8119319b89a5099aad97da5569b1d7a95c4cb3e7961c742536ca5851a73939053f25a1ef85c3ca255b6597fe907
|
7
|
+
data.tar.gz: 5c3a0c209e76eceb05092ed68e9fa1ab555f2bb63172357f70d068657f1e6094caa56e637916452f30fd9a0bdaa726708034cc45f6fb20a3d44024b292fa1ecb
|
checksums.yaml.gz.sig
ADDED
data.tar.gz.sig
ADDED
Binary file
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Copyright (c) 2014 Arnaud Berthomier
|
2
|
+
|
3
|
+
habari2md is free software: you can redistribute it and/or modify it
|
4
|
+
under the terms of the GNU General Public License as published by the
|
5
|
+
Free Software Foundation, either version 3 of the License, or (at your
|
6
|
+
option) any later version.
|
7
|
+
|
8
|
+
habari2md is distributed in the hope that it will be useful, but WITHOUT
|
9
|
+
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
10
|
+
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
11
|
+
for more details.
|
12
|
+
|
13
|
+
You should have received a copy of the GNU General Public License
|
14
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
data/README.md
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Habari2md
|
2
|
+
|
3
|
+
This is a dirty little Ruby program to export a [Habari][habari] blog to
|
4
|
+
markdown format. I used it to avoid installing PHP on a small VPS in order to
|
5
|
+
run a tiny blog of ~2000 posts.
|
6
|
+
|
7
|
+
The program makes a few assumptions about your setup, and this determines what
|
8
|
+
you should expect to get from it.
|
9
|
+
|
10
|
+
* It will connect to a MariaDB/MySQL database,
|
11
|
+
* fetch all of its posts and:
|
12
|
+
* dump one file per published post in the `out` directory ;
|
13
|
+
* use a filename like `YYYY-MM-DD-post-slug.md` where `YYYY-MM-DD` are the
|
14
|
+
year, month, and month day when a particular post was published ;
|
15
|
+
* and format a post header with:
|
16
|
+
|
17
|
+
```
|
18
|
+
title: The original post title
|
19
|
+
author: The author's username
|
20
|
+
```
|
21
|
+
|
22
|
+
This process can be pretty specific, and if it does not fit your setup, feel
|
23
|
+
free to file an issue or, better, send a pull-request. ;)
|
24
|
+
|
25
|
+
# Dependencies
|
26
|
+
|
27
|
+
* Ruby >= 1.9
|
28
|
+
* Python >= 2.x
|
29
|
+
|
30
|
+
# Installation
|
31
|
+
|
32
|
+
`gem install habari2md`
|
33
|
+
|
34
|
+
# Usage
|
35
|
+
|
36
|
+
```
|
37
|
+
$ habari2md -h
|
38
|
+
Usage: habari2md [options]
|
39
|
+
-o, --output [DIR] Output directory
|
40
|
+
-s, --host [HOST] Database host
|
41
|
+
-d, --db [DB] Database name
|
42
|
+
-u, --user [USER] Database user
|
43
|
+
-p, --password [PASS] Database password
|
44
|
+
-h, --help Show this message
|
45
|
+
$ habari2md -o foobar -d my_blog_database -h localhost -u sql_user -p sql_password
|
46
|
+
I, [2014-01-08T23:31:20.771303 #74090] INFO -- : Exporting 12345 posts...
|
47
|
+
I, [2014-01-08T23:31:50.618731 #74090] INFO -- : 50% to go
|
48
|
+
I, [2014-01-08T23:32:20.081583 #74090] INFO -- : We're done.
|
49
|
+
D, [2014-01-08T23:32:20.083582 #74090] DEBUG -- : Terminating 6 actors...
|
50
|
+
W, [2014-01-08T23:32:20.084398 #74090] WARN -- : Terminating task: type=:finalizer, meta={:method_name=>:__shutdown__}, status=:callwait
|
51
|
+
➜
|
52
|
+
```
|
53
|
+
|
54
|
+
# License
|
55
|
+
|
56
|
+
GPL 3.0
|
57
|
+
|
58
|
+
Note: this distribution contains Aaron Swartz's [html2text][html2text] GPL
|
59
|
+
licensed program. As a matter of fact, we fork one process to convert each post
|
60
|
+
from HTML to [Markdown][markdown], yay!
|
61
|
+
|
62
|
+
[habari]: http://habariproject.org/
|
63
|
+
[html2text]: http://www.aaronsw.com/2002/html2text/
|
64
|
+
[markdown]: http://daringfireball.net/projects/markdown/
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/habari2md
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: parse options, then hand off to the exporter.
#
# NOTE: the original pushed `File.dirname(__FILE__) + '../lib'` onto the load
# path — string concatenation without a separator yields ".../bin../lib",
# which never exists. Build the sibling lib/ path properly instead.
$:.unshift(File.expand_path('../../lib', __FILE__))

require 'optparse'
require 'habari2md'

# Defaults for a stock local MySQL setup.
options = {
  out: './out',
  host: 'localhost',
  user: 'root',
  password: 'root',
}

OptionParser.new do |opts|
  opts.banner = 'Usage: habari2md [options]'

  opts.on("-o", "--output [DIR]", "Output directory") do |dir|
    options[:out] = dir
  end

  opts.on("-s", "--host [HOST]", "Database host") do |host|
    options[:host] = host
  end

  opts.on("-d", "--db [DB]", "Database name") do |name|
    options[:db] = name
  end

  opts.on("-u", "--user [USER]", "Database user") do |user|
    options[:user] = user
  end

  opts.on("-p", "--password [PASS]", "Database password") do |pass|
    options[:password] = pass
  end

  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

Habari2md::Exporter.new(options).export_posts(options[:out])
|
data/certs/oz.pem
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
-----BEGIN CERTIFICATE-----
|
2
|
+
MIIDXDCCAkSgAwIBAgIBATANBgkqhkiG9w0BAQUFADA6MQswCQYDVQQDDAJvejEW
|
3
|
+
MBQGCgmSJomT8ixkARkWBmN5cHJpbzETMBEGCgmSJomT8ixkARkWA25ldDAeFw0x
|
4
|
+
NDAxMDkxNzE0NDVaFw0xNTAxMDkxNzE0NDVaMDoxCzAJBgNVBAMMAm96MRYwFAYK
|
5
|
+
CZImiZPyLGQBGRYGY3lwcmlvMRMwEQYKCZImiZPyLGQBGRYDbmV0MIIBIjANBgkq
|
6
|
+
hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy10tbHJlv/nomAnN23gT/9WF0Sfr/6/L
|
7
|
+
o8rkkmtFgI4gZKpY3RmmhJavlzw7Pq3hT50AN+gpacyS6GJ6NRhyR59T7EK0Mar0
|
8
|
+
7vCJhwW8EqjCjI2LVlv5NgJsQE9aFaNvNAl8cMuuWSw3UArB2ZRKsdE1J4KBTBpw
|
9
|
+
7agSPppFarNuHKyAXXsg2rfBmkDvfUKXE+8BccQ3ga1guhfFTAQgk8zLjE21opti
|
10
|
+
7qZbWToBSsV6dzBxpIWVkIcX2HnXsrpE1IJbXBzy60L5kHchzn+o2BB7wemBSMvk
|
11
|
+
yOaC2KRI5Xiy/THIZhheKGAHMvbu7xbz3Wt12J+H5iRBmE+VV/IRvwIDAQABo20w
|
12
|
+
azAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUP61Rx/1umQ17mKwZ
|
13
|
+
nGNam5fTDbMwGAYDVR0RBBEwD4ENb3pAY3lwcmlvLm5ldDAYBgNVHRIEETAPgQ1v
|
14
|
+
ekBjeXByaW8ubmV0MA0GCSqGSIb3DQEBBQUAA4IBAQC0CN++vyu1zcmOyckEHCa1
|
15
|
+
sk579L0i2DRO7bma9t+Z8Th6WVQqXGen0YYxznSzCPqQHN650IItnDUkujVHMI/g
|
16
|
+
ctUmyPXUryOA6EqFi0l+t7QSRysxy/79rZCIRufhFbsNhbwWMwUAEmHmJ2BHO7g4
|
17
|
+
EEI8FdoHY2xWEZ1SBu0gzn0Kmi5u1I6/i3NmvKchmIK3eQcPtu0xwSuFEw7SINcu
|
18
|
+
hfXfqFqS3mCcykIEz+V7ZRcIaiQse+263YcyYSYRws3EvEQH7C7XnUF7/Y6TpwnI
|
19
|
+
QDKpCyE1PBhKqihfimirfnkLKw1ZaUY9Nd8UpOopW8pA3eUdUqo0yJe6IQ7s8LyR
|
20
|
+
-----END CERTIFICATE-----
|
data/habari2md.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'habari2md/version'

Gem::Specification.new do |spec|
  spec.name          = "habari2md"
  spec.version       = Habari2md::VERSION
  spec.authors       = ["Arnaud Berthomier"]
  spec.email         = ["oz@cyprio.net"]
  spec.summary       = %q{Habari to markdown}
  spec.description   = %q{Dump a Habari blog posts to Markdown format}
  spec.homepage      = "https://github.com/oz/habari2md"
  # Use an SPDX identifier: "GPL v3" is not a recognized license string and
  # makes `gem build` warn about an invalid license.
  spec.license       = "GPL-3.0"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = []
  spec.require_paths = ["lib"]
  spec.cert_chain    = ["certs/oz.pem"]
  # Only load the signing key when actually running under `gem` (e.g. build).
  spec.signing_key   = File.expand_path("~/.ssh/gem-private_key.pem") if $0 =~ /gem\z/

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"

  spec.add_dependency "celluloid", "~> 0.15"
  spec.add_dependency "sequel", "~> 4.5"
  spec.add_dependency "mysql", "~> 2.9"
end
|
data/lib/habari2md.rb
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'fileutils'
|
3
|
+
require 'pathname'
|
4
|
+
|
5
|
+
require 'celluloid'
|
6
|
+
require 'sequel'
|
7
|
+
|
8
|
+
module Habari2md
|
9
|
+
# @class Habari2md::Text Text helpers
|
10
|
+
# @class Habari2md::Text Text helpers
class Text
  # Wrap bare text into HTML paragraphs: blank-line-separated chunks become
  # <p> blocks and single newlines become <br /> tags.
  # Shameless snatch from Rails.
  #
  # @param [String, nil] text raw post body (nil treated as empty)
  # @return [String] HTML-formatted text
  def self.simple_format(text)
    text = '' if text.nil?
    text = text.dup
    start_tag = '<p>'
    text = text.to_str
    text.gsub!(/\r\n?/, "\n")                    # \r\n and \r -> \n
    text.gsub!(/\n\n+/, "</p>\n\n#{start_tag}")  # 2+ newlines -> paragraph
    text.gsub!(/([^\n]\n)(?=[^\n])/, '\1<br />') # 1 newline -> br
    text.insert(0, start_tag)
    text << '</p>'
    return text
  end

  # Fork (!) html2text.py to convert from HTML to Markdown.
  #
  # @param [String] content HTML input
  # @return [String] Markdown content
  def self.html2text(content)
    # Array command form never goes through the shell, so the script path is
    # safe even when the gem lives under a directory containing spaces or
    # other shell metacharacters.
    IO.popen([html2text_script], "r+") do |io|
      io.write content
      io.close_write
      content = io.read
      io.close_read
    end
    content
  end

  protected

  # Absolute path to the vendored html2text.py script (memoized).
  def self.html2text_script
    @html2text ||= Pathname.new(File.dirname(__FILE__))
                           .join('vendor', 'html2text.py').to_s
  end
end
|
48
|
+
|
49
|
+
# @class Habari2md::Exporter
|
50
|
+
# @example Export stuff
|
51
|
+
# worker = Habari2md::Exporter.new(db: 'foo', user: 'root')
|
52
|
+
# worker.export_posts("./out")
|
53
|
+
# @class Habari2md::Exporter
# @example Export stuff
#   worker = Habari2md::Exporter.new(db: 'foo', user: 'root')
#   worker.export_posts("./out")
class Exporter
  attr_reader :db
  include Celluloid
  include Celluloid::Logger

  # @param [Hash] opts connection settings (:user, :password, :host, :db)
  def initialize(opts = {})
    @db = Sequel.connect(db_uri opts)
    @counter = 0
    @halfway = 0

    # Cache users, indexed by primary key.
    # NOTE: the original built this hash with an Array key ([user[:id]]),
    # so #user could never find anybody and every post came out author-less.
    @users = @db[:users].all.inject({}) do |cache, user|
      cache.merge!(user[:id] => user)
    end
  end

  # All posts, least-recently-modified first.
  def posts
    db[:posts].order(:modified)
  end

  # @param [Integer] id user primary key
  # @return [Hash] the cached user row, or {} when unknown
  def user(id)
    @users.fetch(id, {})
  end

  # Export every post as a Markdown file under +directory+ using a pool of
  # PostExporter actors; blocks until all of them have reported back.
  def export_posts(directory)
    FileUtils.mkdir_p(directory) unless File.directory?(directory)

    @counter = posts.count
    @halfway = @counter / 2

    info "Exporting #{@counter} posts..."

    pool = Habari2md::PostExporter.pool(args: [directory, current_actor])
    posts.each { |post| pool.async.export(post) }

    wait(:done)
    info "We're done."
  end

  # Called by PostExporter when an export operation has finished.
  def post_exported(post_id)
    @counter -= 1
    info "50% to go" if @counter == @halfway
    signal(:done) if @counter == 0
  end

  protected

  # Build a Sequel connection URI from the option hash.
  def db_uri(opts)
    "mysql://#{opts[:user]}:#{opts[:password]}@#{opts[:host]}/#{opts[:db]}"
  end
end
|
106
|
+
|
107
|
+
# @class Habari2md::PostExporter Export one post
|
108
|
+
# @class Habari2md::PostExporter Export one post
class PostExporter
  include Celluloid

  # Output directory
  attr_reader :dir

  # Manager actor
  attr_reader :manager

  # @param [String] dest_dir output directory
  # @param [Habari2md::Exporter] manager_actor actor notified after each post
  def initialize(dest_dir, manager_actor)
    @dir = Pathname.new(dest_dir)
    @manager = manager_actor
  end

  # Placeholder title for untitled posts
  def untitled
    "Untitled"
  end

  # Signal the managing actor when a post has been exported
  def done(post = {})
    manager.post_exported(post[:id])
  end

  # Export one post to disk
  # @param [Hash] post
  def export(post)
    # Ignore deleted posts and drafts.
    return done(post) unless published?(post)

    author = manager.user(post[:user_id])[:username]
    title = post[:title].gsub(/[\r\n]/, '')
    title = untitled if title == ""
    # Time.at accepts both integer epoch values and Time instances (via
    # #to_i), unlike the former Time.strptime(str, "%s") call, which also
    # relied on an unstated `require 'time'`.
    date = Time.at(post[:pubdate].to_i).strftime("%Y-%m-%d")
    filename = dir.join("#{date}-#{post[:slug]}.md")
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    return done(post) if File.exist?(filename) && ENV['FORCE'].nil?

    # Make sure content is at least formatted with <p> tags before
    # conversion.
    content = Habari2md::Text.simple_format(post[:content])
    File.open(filename, 'w+') do |fh|
      fh << "---\n"
      fh << "title: #{title}\n"
      fh << "author: #{author}\n" unless author.nil?
      fh << "---\n\n"
      fh << Habari2md::Text.html2text(content)
    end

    done(post)
  end

  # This actually depends on the values in the poststatus table.
  def published?(post)
    post[:status] == 2
  end
end
|
164
|
+
end
|
@@ -0,0 +1,914 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
3
|
+
__version__ = "3.200.3"
|
4
|
+
__author__ = "Aaron Swartz (me@aaronsw.com)"
|
5
|
+
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
|
6
|
+
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
7
|
+
|
8
|
+
# TODO:
|
9
|
+
# Support decoded entities with unifiable.
|
10
|
+
|
11
|
+
try:
|
12
|
+
True
|
13
|
+
except NameError:
|
14
|
+
setattr(__builtins__, 'True', 1)
|
15
|
+
setattr(__builtins__, 'False', 0)
|
16
|
+
|
17
|
+
def has_key(x, y):
    """Membership test that works both on legacy mappings exposing
    ``has_key`` and on anything supporting the ``in`` operator."""
    if hasattr(x, 'has_key'):
        return x.has_key(y)
    return y in x
|
20
|
+
|
21
|
+
try:
|
22
|
+
import htmlentitydefs
|
23
|
+
import urlparse
|
24
|
+
import HTMLParser
|
25
|
+
except ImportError: #Python3
|
26
|
+
import html.entities as htmlentitydefs
|
27
|
+
import urllib.parse as urlparse
|
28
|
+
import html.parser as HTMLParser
|
29
|
+
try: #Python3
|
30
|
+
import urllib.request as urllib
|
31
|
+
except:
|
32
|
+
import urllib
|
33
|
+
import optparse, re, sys, codecs, types
|
34
|
+
|
35
|
+
try: from textwrap import wrap
|
36
|
+
except: pass
|
37
|
+
|
38
|
+
# Use Unicode characters instead of their ASCII pseudo-replacements
|
39
|
+
UNICODE_SNOB = 0
|
40
|
+
|
41
|
+
# Escape all special characters. Output is less readable, but avoids corner case formatting issues.
|
42
|
+
ESCAPE_SNOB = 0
|
43
|
+
|
44
|
+
# Put the links after each paragraph instead of at the end.
|
45
|
+
LINKS_EACH_PARAGRAPH = 0
|
46
|
+
|
47
|
+
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
|
48
|
+
BODY_WIDTH = 78
|
49
|
+
|
50
|
+
# Don't show internal links (href="#local-anchor") -- corresponding link targets
|
51
|
+
# won't be visible in the plain text file anyway.
|
52
|
+
SKIP_INTERNAL_LINKS = True
|
53
|
+
|
54
|
+
# Use inline, rather than reference, formatting for images and links
|
55
|
+
INLINE_LINKS = True
|
56
|
+
|
57
|
+
# Number of pixels Google indents nested lists
|
58
|
+
GOOGLE_LIST_INDENT = 36
|
59
|
+
|
60
|
+
IGNORE_ANCHORS = False
|
61
|
+
IGNORE_IMAGES = False
|
62
|
+
IGNORE_EMPHASIS = False
|
63
|
+
|
64
|
+
### Entity Nonsense ###
|
65
|
+
|
66
|
+
def name2cp(k):
    """Resolve an HTML entity name to its Unicode code point."""
    if k == 'apos':
        return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    # Pre-2.3 fallback: entitydefs maps to latin-1 bytes, or to a
    # numeric "&#NNN;" reference for characters outside latin-1.
    entity = htmlentitydefs.entitydefs[k]
    if entity.startswith("&#") and entity.endswith(";"):
        return int(entity[2:-1])  # not in latin-1
    return ord(codecs.latin_1_decode(entity)[0])
|
74
|
+
|
75
|
+
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
|
76
|
+
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
|
77
|
+
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
|
78
|
+
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
|
79
|
+
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
|
80
|
+
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
|
81
|
+
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
|
82
|
+
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
|
83
|
+
'lrm':'', 'rlm':''}
|
84
|
+
|
85
|
+
unifiable_n = {}
|
86
|
+
|
87
|
+
for k in unifiable.keys():
|
88
|
+
unifiable_n[name2cp(k)] = unifiable[k]
|
89
|
+
|
90
|
+
### End Entity Nonsense ###
|
91
|
+
|
92
|
+
def onlywhite(line):
    """Return true if the line does only consist of whitespace characters."""
    for c in line:
        # NOTE: upstream html2text compares against ' ' and '\t'; this copy
        # had the tab garbled into a second space, so tab-indented lines were
        # misclassified. Also use ==/!= instead of identity tests ('is'),
        # which only work by accident of CPython string interning.
        if c != ' ' and c != '\t':
            return c == ' '  # always False here (kept from upstream)
    return line
|
98
|
+
|
99
|
+
def hn(tag):
    """Return the heading level for 'h1'..'h9' tags; falsy otherwise
    (0 when the second character is not a digit, None for other tags)."""
    if len(tag) == 2 and tag[0] == 'h':
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
|
105
|
+
|
106
|
+
def dumb_property_dict(style):
    """returns a hash of css attributes"""
    # dict() over a list (rather than a dict comprehension), matching the
    # rest of the file's support for older pythons.
    pairs = [chunk.split(':', 1) for chunk in style.split(';') if ':' in chunk]
    return dict([(name.strip(), value.strip()) for name, value in pairs])
|
109
|
+
|
110
|
+
def dumb_css_parser(data):
    """returns a hash of css selectors, each of which contains a hash of css attributes"""
    # strip every "@import ...;" statement before parsing
    data += ';'
    at_import = data.find('@import')
    while at_import != -1:
        data = data[0:at_import] + data[data.find(';', at_import) + 1:]
        at_import = data.find('@import')

    # split "selector { body }" chunks; dict() over a list (not a dict
    # comprehension) in order to support older pythons
    rules = [chunk.split('{') for chunk in data.split('}') if '{' in chunk.strip()]
    try:
        return dict([(selector.strip(), dumb_property_dict(body))
                     for selector, body in rules])
    except ValueError:
        return {}  # not that important
|
127
|
+
|
128
|
+
def element_style(attrs, style_def, parent_style):
    """returns a hash of the 'final' style attributes of the element"""
    # start from the inherited style, then layer class styles, then the
    # inline style attribute (highest precedence)
    style = parent_style.copy()
    if 'class' in attrs:
        for css_class in attrs['class'].split():
            style.update(style_def['.' + css_class])
    if 'style' in attrs:
        style.update(dumb_property_dict(attrs['style']))
    return style
|
139
|
+
|
140
|
+
def google_list_style(style):
    """finds out whether this is an ordered or unordered list"""
    unordered_markers = ['disc', 'circle', 'square', 'none']
    if style.get('list-style-type') in unordered_markers:
        return 'ul'
    return 'ol'
|
147
|
+
|
148
|
+
def google_has_height(style):
    """check if the style of the element has the 'height' attribute explicitly defined"""
    # Direct membership test instead of the redundant if/return True/return
    # False branch; result is identical.
    return 'height' in style
|
153
|
+
|
154
|
+
def google_text_emphasis(style):
    """return a list of all emphasis modifiers of the element"""
    # order matters: decoration, then style, then weight (callers scan the
    # list for specific markers such as 'line-through' or 'bold')
    emphasis = []
    for prop in ('text-decoration', 'font-style', 'font-weight'):
        if prop in style:
            emphasis.append(style[prop])
    return emphasis
|
164
|
+
|
165
|
+
def google_fixed_width_font(style):
    """check if the css of the current element defines a fixed width font"""
    font_family = style.get('font-family', '')
    return font_family in ('Courier New', 'Consolas')
|
173
|
+
|
174
|
+
def list_numbering_start(attrs):
    """extract numbering from list element attributes"""
    # an <ol start="N"> begins at N, so the internal counter starts at N-1
    if 'start' not in attrs:
        return 0
    return int(attrs['start']) - 1
|
180
|
+
|
181
|
+
class HTML2Text(HTMLParser.HTMLParser):
|
182
|
+
def __init__(self, out=None, baseurl=''):
|
183
|
+
HTMLParser.HTMLParser.__init__(self)
|
184
|
+
|
185
|
+
# Config options
|
186
|
+
self.unicode_snob = UNICODE_SNOB
|
187
|
+
self.escape_snob = ESCAPE_SNOB
|
188
|
+
self.links_each_paragraph = LINKS_EACH_PARAGRAPH
|
189
|
+
self.body_width = BODY_WIDTH
|
190
|
+
self.skip_internal_links = SKIP_INTERNAL_LINKS
|
191
|
+
self.inline_links = INLINE_LINKS
|
192
|
+
self.google_list_indent = GOOGLE_LIST_INDENT
|
193
|
+
self.ignore_links = IGNORE_ANCHORS
|
194
|
+
self.ignore_images = IGNORE_IMAGES
|
195
|
+
self.ignore_emphasis = IGNORE_EMPHASIS
|
196
|
+
self.google_doc = False
|
197
|
+
self.ul_item_mark = '*'
|
198
|
+
self.emphasis_mark = '_'
|
199
|
+
self.strong_mark = '**'
|
200
|
+
|
201
|
+
if out is None:
|
202
|
+
self.out = self.outtextf
|
203
|
+
else:
|
204
|
+
self.out = out
|
205
|
+
|
206
|
+
self.outtextlist = [] # empty list to store output characters before they are "joined"
|
207
|
+
|
208
|
+
try:
|
209
|
+
self.outtext = unicode()
|
210
|
+
except NameError: # Python3
|
211
|
+
self.outtext = str()
|
212
|
+
|
213
|
+
self.quiet = 0
|
214
|
+
self.p_p = 0 # number of newline character to print before next output
|
215
|
+
self.outcount = 0
|
216
|
+
self.start = 1
|
217
|
+
self.space = 0
|
218
|
+
self.a = []
|
219
|
+
self.astack = []
|
220
|
+
self.maybe_automatic_link = None
|
221
|
+
self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
|
222
|
+
self.acount = 0
|
223
|
+
self.list = []
|
224
|
+
self.blockquote = 0
|
225
|
+
self.pre = 0
|
226
|
+
self.startpre = 0
|
227
|
+
self.code = False
|
228
|
+
self.br_toggle = ''
|
229
|
+
self.lastWasNL = 0
|
230
|
+
self.lastWasList = False
|
231
|
+
self.style = 0
|
232
|
+
self.style_def = {}
|
233
|
+
self.tag_stack = []
|
234
|
+
self.emphasis = 0
|
235
|
+
self.drop_white_space = 0
|
236
|
+
self.inheader = False
|
237
|
+
self.abbr_title = None # current abbreviation definition
|
238
|
+
self.abbr_data = None # last inner HTML (for abbr being defined)
|
239
|
+
self.abbr_list = {} # stack of abbreviations to write later
|
240
|
+
self.baseurl = baseurl
|
241
|
+
|
242
|
+
try: del unifiable_n[name2cp('nbsp')]
|
243
|
+
except KeyError: pass
|
244
|
+
unifiable['nbsp'] = ' _place_holder;'
|
245
|
+
|
246
|
+
|
247
|
+
def feed(self, data):
|
248
|
+
data = data.replace("</' + 'script>", "</ignore>")
|
249
|
+
HTMLParser.HTMLParser.feed(self, data)
|
250
|
+
|
251
|
+
def handle(self, data):
|
252
|
+
self.feed(data)
|
253
|
+
self.feed("")
|
254
|
+
return self.optwrap(self.close())
|
255
|
+
|
256
|
+
def outtextf(self, s):
|
257
|
+
self.outtextlist.append(s)
|
258
|
+
if s: self.lastWasNL = s[-1] == '\n'
|
259
|
+
|
260
|
+
def close(self):
|
261
|
+
HTMLParser.HTMLParser.close(self)
|
262
|
+
|
263
|
+
self.pbr()
|
264
|
+
self.o('', 0, 'end')
|
265
|
+
|
266
|
+
self.outtext = self.outtext.join(self.outtextlist)
|
267
|
+
if self.unicode_snob:
|
268
|
+
nbsp = unichr(name2cp('nbsp'))
|
269
|
+
else:
|
270
|
+
nbsp = u' '
|
271
|
+
self.outtext = self.outtext.replace(u' _place_holder;', nbsp)
|
272
|
+
|
273
|
+
return self.outtext
|
274
|
+
|
275
|
+
def handle_charref(self, c):
|
276
|
+
self.o(self.charref(c), 1)
|
277
|
+
|
278
|
+
def handle_entityref(self, c):
|
279
|
+
self.o(self.entityref(c), 1)
|
280
|
+
|
281
|
+
def handle_starttag(self, tag, attrs):
|
282
|
+
self.handle_tag(tag, attrs, 1)
|
283
|
+
|
284
|
+
def handle_endtag(self, tag):
|
285
|
+
self.handle_tag(tag, None, 0)
|
286
|
+
|
287
|
+
def previousIndex(self, attrs):
|
288
|
+
""" returns the index of certain set of attributes (of a link) in the
|
289
|
+
self.a list
|
290
|
+
|
291
|
+
If the set of attributes is not found, returns None
|
292
|
+
"""
|
293
|
+
if not has_key(attrs, 'href'): return None
|
294
|
+
|
295
|
+
i = -1
|
296
|
+
for a in self.a:
|
297
|
+
i += 1
|
298
|
+
match = 0
|
299
|
+
|
300
|
+
if has_key(a, 'href') and a['href'] == attrs['href']:
|
301
|
+
if has_key(a, 'title') or has_key(attrs, 'title'):
|
302
|
+
if (has_key(a, 'title') and has_key(attrs, 'title') and
|
303
|
+
a['title'] == attrs['title']):
|
304
|
+
match = True
|
305
|
+
else:
|
306
|
+
match = True
|
307
|
+
|
308
|
+
if match: return i
|
309
|
+
|
310
|
+
def drop_last(self, nLetters):
|
311
|
+
if not self.quiet:
|
312
|
+
self.outtext = self.outtext[:-nLetters]
|
313
|
+
|
314
|
+
def handle_emphasis(self, start, tag_style, parent_style):
|
315
|
+
"""handles various text emphases"""
|
316
|
+
tag_emphasis = google_text_emphasis(tag_style)
|
317
|
+
parent_emphasis = google_text_emphasis(parent_style)
|
318
|
+
|
319
|
+
# handle Google's text emphasis
|
320
|
+
strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
|
321
|
+
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
|
322
|
+
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
|
323
|
+
fixed = google_fixed_width_font(tag_style) and not \
|
324
|
+
google_fixed_width_font(parent_style) and not self.pre
|
325
|
+
|
326
|
+
if start:
|
327
|
+
# crossed-out text must be handled before other attributes
|
328
|
+
# in order not to output qualifiers unnecessarily
|
329
|
+
if bold or italic or fixed:
|
330
|
+
self.emphasis += 1
|
331
|
+
if strikethrough:
|
332
|
+
self.quiet += 1
|
333
|
+
if italic:
|
334
|
+
self.o(self.emphasis_mark)
|
335
|
+
self.drop_white_space += 1
|
336
|
+
if bold:
|
337
|
+
self.o(self.strong_mark)
|
338
|
+
self.drop_white_space += 1
|
339
|
+
if fixed:
|
340
|
+
self.o('`')
|
341
|
+
self.drop_white_space += 1
|
342
|
+
self.code = True
|
343
|
+
else:
|
344
|
+
if bold or italic or fixed:
|
345
|
+
# there must not be whitespace before closing emphasis mark
|
346
|
+
self.emphasis -= 1
|
347
|
+
self.space = 0
|
348
|
+
self.outtext = self.outtext.rstrip()
|
349
|
+
if fixed:
|
350
|
+
if self.drop_white_space:
|
351
|
+
# empty emphasis, drop it
|
352
|
+
self.drop_last(1)
|
353
|
+
self.drop_white_space -= 1
|
354
|
+
else:
|
355
|
+
self.o('`')
|
356
|
+
self.code = False
|
357
|
+
if bold:
|
358
|
+
if self.drop_white_space:
|
359
|
+
# empty emphasis, drop it
|
360
|
+
self.drop_last(2)
|
361
|
+
self.drop_white_space -= 1
|
362
|
+
else:
|
363
|
+
self.o(self.strong_mark)
|
364
|
+
if italic:
|
365
|
+
if self.drop_white_space:
|
366
|
+
# empty emphasis, drop it
|
367
|
+
self.drop_last(1)
|
368
|
+
self.drop_white_space -= 1
|
369
|
+
else:
|
370
|
+
self.o(self.emphasis_mark)
|
371
|
+
# space is only allowed after *all* emphasis marks
|
372
|
+
if (bold or italic) and not self.emphasis:
|
373
|
+
self.o(" ")
|
374
|
+
if strikethrough:
|
375
|
+
self.quiet -= 1
|
376
|
+
|
377
|
+
def handle_tag(self, tag, attrs, start):
|
378
|
+
#attrs = fixattrs(attrs)
|
379
|
+
if attrs is None:
|
380
|
+
attrs = {}
|
381
|
+
else:
|
382
|
+
attrs = dict(attrs)
|
383
|
+
|
384
|
+
if self.google_doc:
|
385
|
+
# the attrs parameter is empty for a closing tag. in addition, we
|
386
|
+
# need the attributes of the parent nodes in order to get a
|
387
|
+
# complete style description for the current element. we assume
|
388
|
+
# that google docs export well formed html.
|
389
|
+
parent_style = {}
|
390
|
+
if start:
|
391
|
+
if self.tag_stack:
|
392
|
+
parent_style = self.tag_stack[-1][2]
|
393
|
+
tag_style = element_style(attrs, self.style_def, parent_style)
|
394
|
+
self.tag_stack.append((tag, attrs, tag_style))
|
395
|
+
else:
|
396
|
+
dummy, attrs, tag_style = self.tag_stack.pop()
|
397
|
+
if self.tag_stack:
|
398
|
+
parent_style = self.tag_stack[-1][2]
|
399
|
+
|
400
|
+
if hn(tag):
|
401
|
+
self.p()
|
402
|
+
if start:
|
403
|
+
self.inheader = True
|
404
|
+
self.o(hn(tag)*"#" + ' ')
|
405
|
+
else:
|
406
|
+
self.inheader = False
|
407
|
+
return # prevent redundant emphasis marks on headers
|
408
|
+
|
409
|
+
if tag in ['p', 'div']:
|
410
|
+
if self.google_doc:
|
411
|
+
if start and google_has_height(tag_style):
|
412
|
+
self.p()
|
413
|
+
else:
|
414
|
+
self.soft_br()
|
415
|
+
else:
|
416
|
+
self.p()
|
417
|
+
|
418
|
+
if tag == "br" and start: self.o(" \n")
|
419
|
+
|
420
|
+
if tag == "hr" and start:
|
421
|
+
self.p()
|
422
|
+
self.o("* * *")
|
423
|
+
self.p()
|
424
|
+
|
425
|
+
if tag in ["head", "style", 'script']:
|
426
|
+
if start: self.quiet += 1
|
427
|
+
else: self.quiet -= 1
|
428
|
+
|
429
|
+
if tag == "style":
|
430
|
+
if start: self.style += 1
|
431
|
+
else: self.style -= 1
|
432
|
+
|
433
|
+
if tag in ["body"]:
|
434
|
+
self.quiet = 0 # sites like 9rules.com never close <head>
|
435
|
+
|
436
|
+
if tag == "blockquote":
|
437
|
+
if start:
|
438
|
+
self.p(); self.o('> ', 0, 1); self.start = 1
|
439
|
+
self.blockquote += 1
|
440
|
+
else:
|
441
|
+
self.blockquote -= 1
|
442
|
+
self.p()
|
443
|
+
|
444
|
+
if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
|
445
|
+
if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
|
446
|
+
if tag in ['del', 'strike', 's']:
|
447
|
+
if start:
|
448
|
+
self.o("<"+tag+">")
|
449
|
+
else:
|
450
|
+
self.o("</"+tag+">")
|
451
|
+
|
452
|
+
if self.google_doc:
|
453
|
+
if not self.inheader:
|
454
|
+
# handle some font attributes, but leave headers clean
|
455
|
+
self.handle_emphasis(start, tag_style, parent_style)
|
456
|
+
|
457
|
+
if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
|
458
|
+
if tag == "abbr":
|
459
|
+
if start:
|
460
|
+
self.abbr_title = None
|
461
|
+
self.abbr_data = ''
|
462
|
+
if has_key(attrs, 'title'):
|
463
|
+
self.abbr_title = attrs['title']
|
464
|
+
else:
|
465
|
+
if self.abbr_title != None:
|
466
|
+
self.abbr_list[self.abbr_data] = self.abbr_title
|
467
|
+
self.abbr_title = None
|
468
|
+
self.abbr_data = ''
|
469
|
+
|
470
|
+
if tag == "a" and not self.ignore_links:
|
471
|
+
if start:
|
472
|
+
if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
|
473
|
+
self.astack.append(attrs)
|
474
|
+
self.maybe_automatic_link = attrs['href']
|
475
|
+
else:
|
476
|
+
self.astack.append(None)
|
477
|
+
else:
|
478
|
+
if self.astack:
|
479
|
+
a = self.astack.pop()
|
480
|
+
if self.maybe_automatic_link:
|
481
|
+
self.maybe_automatic_link = None
|
482
|
+
elif a:
|
483
|
+
if self.inline_links:
|
484
|
+
self.o("](" + escape_md(a['href']) + ")")
|
485
|
+
else:
|
486
|
+
i = self.previousIndex(a)
|
487
|
+
if i is not None:
|
488
|
+
a = self.a[i]
|
489
|
+
else:
|
490
|
+
self.acount += 1
|
491
|
+
a['count'] = self.acount
|
492
|
+
a['outcount'] = self.outcount
|
493
|
+
self.a.append(a)
|
494
|
+
self.o("][" + str(a['count']) + "]")
|
495
|
+
|
496
|
+
if tag == "img" and start and not self.ignore_images:
|
497
|
+
if has_key(attrs, 'src'):
|
498
|
+
attrs['href'] = attrs['src']
|
499
|
+
alt = attrs.get('alt', '')
|
500
|
+
self.o("![" + escape_md(alt) + "]")
|
501
|
+
|
502
|
+
if self.inline_links:
|
503
|
+
self.o("(" + escape_md(attrs['href']) + ")")
|
504
|
+
else:
|
505
|
+
i = self.previousIndex(attrs)
|
506
|
+
if i is not None:
|
507
|
+
attrs = self.a[i]
|
508
|
+
else:
|
509
|
+
self.acount += 1
|
510
|
+
attrs['count'] = self.acount
|
511
|
+
attrs['outcount'] = self.outcount
|
512
|
+
self.a.append(attrs)
|
513
|
+
self.o("[" + str(attrs['count']) + "]")
|
514
|
+
|
515
|
+
if tag == 'dl' and start: self.p()
|
516
|
+
if tag == 'dt' and not start: self.pbr()
|
517
|
+
if tag == 'dd' and start: self.o(' ')
|
518
|
+
if tag == 'dd' and not start: self.pbr()
|
519
|
+
|
520
|
+
if tag in ["ol", "ul"]:
|
521
|
+
# Google Docs create sub lists as top level lists
|
522
|
+
if (not self.list) and (not self.lastWasList):
|
523
|
+
self.p()
|
524
|
+
if start:
|
525
|
+
if self.google_doc:
|
526
|
+
list_style = google_list_style(tag_style)
|
527
|
+
else:
|
528
|
+
list_style = tag
|
529
|
+
numbering_start = list_numbering_start(attrs)
|
530
|
+
self.list.append({'name':list_style, 'num':numbering_start})
|
531
|
+
else:
|
532
|
+
if self.list: self.list.pop()
|
533
|
+
self.lastWasList = True
|
534
|
+
else:
|
535
|
+
self.lastWasList = False
|
536
|
+
|
537
|
+
if tag == 'li':
|
538
|
+
self.pbr()
|
539
|
+
if start:
|
540
|
+
if self.list: li = self.list[-1]
|
541
|
+
else: li = {'name':'ul', 'num':0}
|
542
|
+
if self.google_doc:
|
543
|
+
nest_count = self.google_nest_count(tag_style)
|
544
|
+
else:
|
545
|
+
nest_count = len(self.list)
|
546
|
+
self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
|
547
|
+
if li['name'] == "ul": self.o(self.ul_item_mark + " ")
|
548
|
+
elif li['name'] == "ol":
|
549
|
+
li['num'] += 1
|
550
|
+
self.o(str(li['num'])+". ")
|
551
|
+
self.start = 1
|
552
|
+
|
553
|
+
if tag in ["table", "tr"] and start: self.p()
|
554
|
+
if tag == 'td': self.pbr()
|
555
|
+
|
556
|
+
if tag == "pre":
|
557
|
+
if start:
|
558
|
+
self.startpre = 1
|
559
|
+
self.pre = 1
|
560
|
+
else:
|
561
|
+
self.pre = 0
|
562
|
+
self.p()
|
563
|
+
|
564
|
+
def pbr(self):
    """Queue a soft paragraph break (one newline) before the next output.

    Never downgrades a pending hard break (p_p == 2 set by p()).
    """
    if self.p_p != 0:
        return
    self.p_p = 1
|
567
|
+
|
568
|
+
def p(self):
    """Queue a hard paragraph break (blank line) before the next output."""
    self.p_p = 2
|
570
|
+
|
571
|
+
def soft_br(self):
    """Queue a Markdown soft line break: a pending newline whose preceding
    line gets trailing space(s) via br_toggle.

    NOTE(review): dump shows a single trailing space; Markdown soft breaks
    need two — possible whitespace collapse in this copy, confirm upstream.
    """
    self.pbr()
    self.br_toggle = ' '
|
574
|
+
|
575
|
+
def o(self, data, puredata=0, force=0):
    """Emit `data` to the output stream, applying all pending layout state.

    This is the central output funnel: it consumes the queued paragraph
    breaks (self.p_p), blockquote/pre indentation, pending spaces, and —
    at paragraph ends — flushes accumulated reference-style links and
    abbreviation definitions.

    puredata -- when truthy (and not inside <pre>), runs of whitespace in
                `data` are collapsed to single spaces.
    force    -- truthy forces output of empty data; the special value
                'end' additionally flushes end-of-document state.
    """
    # Abbreviation text being collected (set by an open <abbr> tag).
    if self.abbr_data is not None:
        self.abbr_data += data

    # self.quiet > 0 means we are inside suppressed content (e.g. <script>).
    if not self.quiet:
        if self.google_doc:
            # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
            lstripped_data = data.lstrip()
            if self.drop_white_space and not (self.pre or self.code):
                data = lstripped_data
            if lstripped_data != '':
                self.drop_white_space = 0

        if puredata and not self.pre:
            # Collapse whitespace runs outside preformatted text.
            # NOTE(review): '\s+' in a non-raw string — works, but raises a
            # DeprecationWarning on modern Pythons; should be r'\s+'.
            data = re.sub('\s+', ' ', data)
            if data and data[0] == ' ':
                # Defer the leading space; it may be dropped at line starts.
                self.space = 1
                data = data[1:]
        if not data and not force: return

        if self.startpre:
            #self.out(" :") #TODO: not output when already one there
            if not data.startswith("\n"): # <pre>stuff...
                data = "\n" + data

        # Blockquote prefix for every emitted line.
        bq = (">" * self.blockquote)
        if not (force and data and data[0] == ">") and self.blockquote: bq += " "

        if self.pre:
            # Indent <pre> content so Markdown treats it as a code block.
            # NOTE(review): these one-space literals look collapsed — the
            # usual implementation appends four spaces per level; confirm.
            if not self.list:
                bq += " "
            #else: list content is already partially indented
            # NOTE(review): xrange is Python 2 only; this path would raise
            # NameError on Python 3.
            for i in xrange(len(self.list)):
                bq += " "
            data = data.replace("\n", "\n"+bq)

        if self.startpre:
            self.startpre = 0
            if self.list:
                data = data.lstrip("\n") # use existing initial indentation

        if self.start:
            # First output of the document: drop any queued space/breaks.
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        if self.p_p:
            # Flush queued paragraph breaks, carrying the blockquote prefix
            # and any soft-break marker onto each new line.
            self.out((self.br_toggle+'\n'+bq)*self.p_p)
            self.space = 0
            self.br_toggle = ''

        if self.space:
            if not self.lastWasNL: self.out(' ')
            self.space = 0

        # At a paragraph boundary (or document end), emit reference-style
        # link definitions collected so far.
        if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
            if force == "end": self.out("\n")

            newa = []
            for link in self.a:
                # Only flush links whose anchor text has already been output.
                if self.outcount > link['outcount']:
                    # NOTE(review): leading space before '[' looks collapsed;
                    # reference defs are usually indented three spaces.
                    self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                    if has_key(link, 'title'): self.out(" ("+link['title']+")")
                    self.out("\n")
                else:
                    newa.append(link)

            if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

            self.a = newa

        # At document end, emit collected <abbr> definitions.
        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out(" *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.outcount += 1
|
659
|
+
|
660
|
+
def handle_data(self, data):
    """Parser callback for text nodes.

    Handles leaving suppressed <script> content, accumulating inline CSS,
    resolving a pending automatic link (<http://...>) decision, and
    escaping Markdown-sensitive characters before forwarding to o().
    """
    # A literal "\/script>" in the text marks the end of escaped script
    # content; leave quiet mode. (Matches how the tag handler enters it.)
    if r'\/script>' in data: self.quiet -= 1

    if self.style:
        # Inside a <style> element: accumulate parsed CSS class definitions.
        self.style_def.update(dumb_css_parser(data))

    # A pending <a href=...> whose rendering depends on its text: if the
    # text equals the absolute href, emit an automatic link <url> and stop;
    # otherwise open a normal "[text](...)" link and fall through.
    # NOTE(review): "not x is None" is the non-idiomatic spelling of
    # "x is not None".
    if not self.maybe_automatic_link is None:
        href = self.maybe_automatic_link
        if href == data and self.absolute_url_matcher.match(href):
            self.o("<" + data + ">")
            return
        else:
            self.o("[")
            self.maybe_automatic_link = None

    # Escape Markdown metacharacters except inside code/pre content.
    if not self.code and not self.pre:
        data = escape_md_section(data, snob=self.escape_snob)
    self.o(data, 1)
|
678
|
+
|
679
|
+
def unknown_decl(self, data):
    """Ignore SGML declarations (e.g. CDATA sections) -- no Markdown output."""
    pass
|
680
|
+
|
681
|
+
def charref(self, name):
    """Resolve a numeric character reference body (after '&#', before ';').

    name -- digits ('65') or hex with x/X prefix ('x41').
    Returns the unifiable ASCII replacement when unicode_snob is off and
    one is registered, otherwise the actual Unicode character.
    """
    if name[0] in ['x', 'X']:
        c = int(name[1:], 16)
    else:
        c = int(name)

    # Idiom fix: membership test directly on the dict instead of .keys()
    # (same result, no intermediate key view/list).
    if not self.unicode_snob and c in unifiable_n:
        return unifiable_n[c]
    else:
        try:
            return unichr(c)
        except NameError: #Python3
            return chr(c)
|
694
|
+
|
695
|
+
def entityref(self, c):
    """Resolve a named entity reference body (after '&', before ';').

    Returns the unifiable ASCII replacement when unicode_snob is off and
    one is registered; the resolved Unicode character when the name is
    known; otherwise the reference is echoed back literally.
    """
    if not self.unicode_snob and c in unifiable.keys():
        return unifiable[c]
    try:
        codepoint = name2cp(c)
    except KeyError:
        # Unknown entity name: pass it through untouched.
        return "&" + c + ';'
    try:
        return unichr(codepoint)
    except NameError: #Python3
        return chr(codepoint)
|
706
|
+
|
707
|
+
def replaceEntities(self, s):
    """re.sub callback: translate one matched entity/character reference.

    s -- a match whose group(1) is the reference body without '&'/';'.
    """
    ref = s.group(1)
    if ref[0] == "#":
        # Numeric reference ('#65', '#x41').
        return self.charref(ref[1:])
    # Named reference ('amp', 'nbsp', ...).
    return self.entityref(ref)
|
712
|
+
|
713
|
+
# Matches one HTML entity or character reference, e.g. "&amp;", "&#65;",
# "&#x41;"; group(1) captures the body without the surrounding '&' and ';'.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(self, s):
    # Expand every entity/character reference in `s` via replaceEntities().
    return self.r_unescape.sub(self.replaceEntities, s)
|
716
|
+
|
717
|
+
def google_nest_count(self, style):
    """calculate the nesting count of google doc lists

    style -- parsed CSS property dict; Google encodes list depth in
             'margin-left' as a pixel value (e.g. '72px').
    Returns an int depth: margin // google_list_indent, or 0 when no
    margin-left is present.
    """
    nest_count = 0
    if 'margin-left' in style:
        # Bug fix: use floor division. On Python 3, '/' yields a float
        # (e.g. 2.0), which breaks callers that compute indentation via
        # string repetition (" " * nest_count raises TypeError).
        # On Python 2 integer '/' already floored, so behavior there is
        # unchanged.
        nest_count = int(style['margin-left'][:-2]) // self.google_list_indent
    return nest_count
|
723
|
+
|
724
|
+
|
725
|
+
def optwrap(self, text):
    """Wrap all paragraphs in the provided text.

    Each newline-separated chunk is re-wrapped to self.body_width unless
    skipwrap() says it must be preserved (code blocks, list items).
    `newlines` tracks consecutive emitted newlines so blank runs in the
    input collapse to at most one blank line.
    """
    # body_width == 0 disables wrapping entirely.
    if not self.body_width:
        return text

    assert wrap, "Requires Python 2.3."
    result = ''
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            if not skipwrap(para):
                result += "\n".join(wrap(para, self.body_width))
                # NOTE(review): the single-space literals below look like
                # collapsed two-space Markdown soft-break markers ('  ',
                # '  \n'); confirm against upstream html2text.
                if para.endswith(' '):
                    result += " \n"
                    newlines = 1
                else:
                    result += "\n\n"
                    newlines = 2
            else:
                # Unwrappable line: keep verbatim (unless pure whitespace).
                if not onlywhite(para):
                    result += para + "\n"
                    newlines = 1
        else:
            # Empty input line: emit at most two consecutive newlines.
            if newlines < 2:
                result += "\n"
                newlines += 1
    return result
|
752
|
+
|
753
|
+
# --- Module-level patterns used by skipwrap() and the escape_md* helpers ---

# "1. " — an ordered-list item marker.
ordered_list_matcher = re.compile(r'\d+\.\s')
# "- ", "* ", "+ " — an unordered-list item marker.
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped inside other Markdown constructs (link labels/targets).
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Full set of Markdown metacharacters, used in --escape-all ("snob") mode.
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# A leading "<number>." that would otherwise start an ordered list.
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# A leading "+" that would otherwise start an unordered list.
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
# Characters that may follow a backslash and therefore make it significant.
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)

def skipwrap(para):
    """Return True when `para` must not be re-wrapped by optwrap().

    Code blocks and list items keep their original line breaks.
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    # Bug fix: the literal here had been whitespace-collapsed to a single
    # space (making the four-space test match nothing the comment intends);
    # restored to four spaces. Also use para[0:1] so an empty string cannot
    # raise IndexError.
    if para[0:4] == '    ' or para[0:1] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceeded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
|
802
|
+
|
803
|
+
def wrapwrite(text):
    """Write `text` to stdout as UTF-8 bytes (works on Python 2 and 3)."""
    encoded = text.encode('utf-8')
    try:
        # Python 3: bytes must go through the underlying binary buffer.
        sys.stdout.buffer.write(encoded)
    except AttributeError:
        # Python 2: sys.stdout accepts the byte string directly.
        sys.stdout.write(encoded)
|
809
|
+
|
810
|
+
def html2text(html, baseurl=''):
    """One-shot convenience wrapper: convert an HTML string to Markdown.

    baseurl -- base used to resolve relative link targets.
    """
    converter = HTML2Text(baseurl=baseurl)
    return converter.handle(html)
|
813
|
+
|
814
|
+
def unescape(s, unicode_snob=False):
    """Module-level helper: expand HTML entities in `s`.

    unicode_snob -- when True, keep real Unicode characters instead of
    the ASCII "unifiable" replacements.
    """
    parser = HTML2Text()
    parser.unicode_snob = unicode_snob
    return parser.unescape(s)
|
818
|
+
|
819
|
+
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs."""
    # Backslash-escape \ [ ] ( ) so they survive inside link syntax.
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped
|
822
|
+
|
823
|
+
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    snob -- when True, escape the full metacharacter set (less readable
    output, fewer corner-case formatting accidents).
    """
    # Backslashes first, so the escapes added below are not re-escaped.
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # A leading "1.", "+", or "-" would otherwise start a list or rule.
    for line_start_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        text = line_start_matcher.sub(r"\1\\\2", text)
    return text
|
832
|
+
|
833
|
+
|
834
|
+
def main():
    """Command-line entry point: read HTML from a file, URL, or stdin and
    write the Markdown conversion to stdout.

    Usage: prog [(filename|url) [encoding]]
    """
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            # NOTE(review): urllib.urlopen is Python 2 only; Python 3 needs
            # urllib.request.urlopen. Presumably this vendored script targets
            # Python 2 — confirm before running under Python 3.
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): encoding is initialised to "utf-8" above, so this
            # None branch (feedparser-based detection) appears unreachable.
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            # NOTE(review): same as above — chardet detection is dead code
            # while encoding defaults to "utf-8".
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # No file argument: read the HTML from standard input.
        data = sys.stdin.read()

    # NOTE(review): on Python 3, sys.stdin.read() returns str, which has no
    # decode(); only the bytes paths (URL/file) decode cleanly there.
    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    # Copy remaining CLI options onto the converter instance.
    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
|
911
|
+
|
912
|
+
|
913
|
+
# Allow running this module directly as a command-line converter.
if __name__ == "__main__":
    main()
|