habari2md 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c2fb0d61a3de12d336d24ab9b5d3e47ec7b9a098
4
+ data.tar.gz: 063bedd9049364662e8e200efbf4a2a1c97a9f46
5
+ SHA512:
6
+ metadata.gz: 17550bf8808495e447d04fe9f5475e170a28f8119319b89a5099aad97da5569b1d7a95c4cb3e7961c742536ca5851a73939053f25a1ef85c3ca255b6597fe907
7
+ data.tar.gz: 5c3a0c209e76eceb05092ed68e9fa1ab555f2bb63172357f70d068657f1e6094caa56e637916452f30fd9a0bdaa726708034cc45f6fb20a3d44024b292fa1ecb
@@ -0,0 +1,2 @@
1
+ .����-��o�W�Q����<�C�^d3��_�A�垌��{)YT�o.᎝���B�m|��y���ĂDƠ���/�����2%�*��Q��O�nY�+��\!b����ղ
2
+ j'��(s
Binary file
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in habari2md.gemspec
4
+ gemspec
@@ -0,0 +1,14 @@
1
+ Copyright (c) 2014 Arnaud Berthomier
2
+
3
+ habari2md is free software: you can redistribute it and/or modify it
4
+ under the terms of the GNU General Public License as published by the
5
+ Free Software Foundation, either version 3 of the License, or (at your
6
+ option) any later version.
7
+
8
+ habari2md is distributed in the hope that it will be useful, but WITHOUT
9
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11
+ for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -0,0 +1,64 @@
1
+ # Habari2md
2
+
3
+ This is a dirty little Ruby program to export a [Habari][habari] blog to
4
+ markdown format. I used it to avoid installing PHP on a small VPS in order to
5
+ run a tiny blog of ~2000 posts.
6
+
7
+ The program makes a few assumptions about your setup, and this conditions what
8
+ you should expect to get from it.
9
+
10
+ * It will connect to a MariaDB/MySQL database,
11
+ * fetch all of its posts and:
12
+ * dump one file per published post in the `out` directory ;
13
+ * use a filename like `YYYY-MM-DD-post-slug.md` where `YYYY-MM-DD` are the
14
+ year, month, and month day when a particular post was published ;
15
+ * and format a post header with:
16
+
17
+ ```
18
+ title: The original post title
19
+ author: The author's username
20
+ ```
21
+
22
+ This process can be pretty specific, and if it does not fit your setup, feel
23
+ free to file an issue or, better, send a pull-request. ;)
24
+
25
+ # Dependencies
26
+
27
+ * Ruby >= 1.9
28
+ * Python >= 2.x
29
+
30
+ # Installation
31
+
32
+ `gem install habari2md`
33
+
34
+ # Usage
35
+
36
+ ```
37
+ $ habari2md -h
38
+ Usage: habari2md [options]
39
+ -o, --output [DIR] Output directory
40
+ -s, --host [HOST] Database host
41
+ -d, --db [DB] Database name
42
+ -u, --user [USER] Database user
43
+ -p, --password [PASS] Database password
44
+ -h, --help Show this message
45
+ $ habari2md -o foobar -d my_blog_database -s localhost -u sql_user -p sql_password
46
+ I, [2014-01-08T23:31:20.771303 #74090] INFO -- : Exporting 12345 posts...
47
+ I, [2014-01-08T23:31:50.618731 #74090] INFO -- : 50% to go
48
+ I, [2014-01-08T23:32:20.081583 #74090] INFO -- : We're done.
49
+ D, [2014-01-08T23:32:20.083582 #74090] DEBUG -- : Terminating 6 actors...
50
+ W, [2014-01-08T23:32:20.084398 #74090] WARN -- : Terminating task: type=:finalizer, meta={:method_name=>:__shutdown__}, status=:callwait
51
+
52
+ ```
53
+
54
+ # License
55
+
56
+ GPL 3.0
57
+
58
+ Note: this distribution contains Aaron Swartz's [html2text][html2text] GPL
59
+ licensed program. As a matter of fact, we fork one process to convert each post
60
+ from HTML to [Markdown][markdown], yay!
61
+
62
+ [habari]: http://habariproject.org/
63
+ [html2text]: http://www.aaronsw.com/2002/html2text/
64
+ [markdown]: http://daringfireball.net/projects/markdown/
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,44 @@
1
#!/usr/bin/env ruby

# Command-line entry point: parse CLI options, then run the exporter.
#
# Bug fix: the original pushed `File.dirname(__FILE__) + '../lib'` onto the
# load path — the missing separator yields a path like "bin../lib", so the
# sibling lib/ directory was never actually added.
$:.unshift(File.expand_path('../../lib', __FILE__))

require 'optparse'
require 'habari2md'

# Defaults mirror a stock local MySQL setup; :db has no default and must be
# provided with -d.
options = {
  out: './out',
  host: 'localhost',
  user: 'root',
  password: 'root',
}

OptionParser.new do |opts|
  opts.banner = 'Usage: habari2md [options]'

  opts.on("-o", "--output [DIR]", "Output directory") do |dir|
    options[:out] = dir
  end

  # NOTE: -s, not -h, selects the host; -h prints this help.
  opts.on("-s", "--host [HOST]", "Database host") do |host|
    options[:host] = host
  end

  opts.on("-d", "--db [DB]", "Database name") do |name|
    options[:db] = name
  end

  opts.on("-u", "--user [USER]", "Database user") do |user|
    options[:user] = user
  end

  opts.on("-p", "--password [PASS]", "Database password") do |pass|
    options[:password] = pass
  end

  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

Habari2md::Exporter.new(options).export_posts(options[:out])
@@ -0,0 +1,20 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIDXDCCAkSgAwIBAgIBATANBgkqhkiG9w0BAQUFADA6MQswCQYDVQQDDAJvejEW
3
+ MBQGCgmSJomT8ixkARkWBmN5cHJpbzETMBEGCgmSJomT8ixkARkWA25ldDAeFw0x
4
+ NDAxMDkxNzE0NDVaFw0xNTAxMDkxNzE0NDVaMDoxCzAJBgNVBAMMAm96MRYwFAYK
5
+ CZImiZPyLGQBGRYGY3lwcmlvMRMwEQYKCZImiZPyLGQBGRYDbmV0MIIBIjANBgkq
6
+ hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy10tbHJlv/nomAnN23gT/9WF0Sfr/6/L
7
+ o8rkkmtFgI4gZKpY3RmmhJavlzw7Pq3hT50AN+gpacyS6GJ6NRhyR59T7EK0Mar0
8
+ 7vCJhwW8EqjCjI2LVlv5NgJsQE9aFaNvNAl8cMuuWSw3UArB2ZRKsdE1J4KBTBpw
9
+ 7agSPppFarNuHKyAXXsg2rfBmkDvfUKXE+8BccQ3ga1guhfFTAQgk8zLjE21opti
10
+ 7qZbWToBSsV6dzBxpIWVkIcX2HnXsrpE1IJbXBzy60L5kHchzn+o2BB7wemBSMvk
11
+ yOaC2KRI5Xiy/THIZhheKGAHMvbu7xbz3Wt12J+H5iRBmE+VV/IRvwIDAQABo20w
12
+ azAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUP61Rx/1umQ17mKwZ
13
+ nGNam5fTDbMwGAYDVR0RBBEwD4ENb3pAY3lwcmlvLm5ldDAYBgNVHRIEETAPgQ1v
14
+ ekBjeXByaW8ubmV0MA0GCSqGSIb3DQEBBQUAA4IBAQC0CN++vyu1zcmOyckEHCa1
15
+ sk579L0i2DRO7bma9t+Z8Th6WVQqXGen0YYxznSzCPqQHN650IItnDUkujVHMI/g
16
+ ctUmyPXUryOA6EqFi0l+t7QSRysxy/79rZCIRufhFbsNhbwWMwUAEmHmJ2BHO7g4
17
+ EEI8FdoHY2xWEZ1SBu0gzn0Kmi5u1I6/i3NmvKchmIK3eQcPtu0xwSuFEw7SINcu
18
+ hfXfqFqS3mCcykIEz+V7ZRcIaiQse+263YcyYSYRws3EvEQH7C7XnUF7/Y6TpwnI
19
+ QDKpCyE1PBhKqihfimirfnkLKw1ZaUY9Nd8UpOopW8pA3eUdUqo0yJe6IQ7s8LyR
20
+ -----END CERTIFICATE-----
@@ -0,0 +1,29 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'habari2md/version'

Gem::Specification.new do |spec|
  spec.name          = "habari2md"
  spec.version       = Habari2md::VERSION
  spec.authors       = ["Arnaud Berthomier"]
  spec.email         = ["oz@cyprio.net"]
  spec.summary       = %q{Habari to markdown}
  spec.description   = %q{Dump a Habari blog posts to Markdown format}
  spec.homepage      = "https://github.com/oz/habari2md"
  # Use the SPDX identifier: "GPL v3" is not a recognized license id and
  # makes RubyGems emit a warning at build/push time.
  spec.license       = "GPL-3.0"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = []
  spec.require_paths = ["lib"]
  spec.cert_chain    = ["certs/oz.pem"]
  # Only sign when invoked through the `gem` executable.
  spec.signing_key   = File.expand_path("~/.ssh/gem-private_key.pem") if $0 =~ /gem\z/

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"

  spec.add_dependency "celluloid", "~> 0.15"
  spec.add_dependency "sequel", "~> 4.5"
  spec.add_dependency "mysql", "~> 2.9"
end
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env ruby
2
require 'fileutils'
require 'pathname'
require 'time'

require 'celluloid'
require 'sequel'
7
+
8
+ module Habari2md
9
+ # @class Habari2md::Text Text helpers
10
+ class Text
11
+ # Shameless snatch from Rails.
12
+ # @param [String] text
13
+ # @return [String]
14
+ def self.simple_format(text)
15
+ text = '' if text.nil?
16
+ text = text.dup
17
+ start_tag = '<p>'
18
+ text = text.to_str
19
+ text.gsub!(/\r\n?/, "\n") # \r\n and \r -> \n
20
+ text.gsub!(/\n\n+/, "</p>\n\n#{start_tag}") # 2+ newline -> paragraph
21
+ text.gsub!(/([^\n]\n)(?=[^\n])/, '\1<br />') # 1 newline -> br
22
+ text.insert(0, start_tag)
23
+ text << '</p>'
24
+ return text
25
+ end
26
+
27
+ # Fork (!) html2text.py to convert form HTML to Markdown.
28
+ #
29
+ # @param [String] content
30
+ # @reutnr [String] Markdown content
31
+ def self.html2text(content)
32
+ IO.popen(html2text_script, "r+") do |io|
33
+ io.write content
34
+ io.close_write
35
+ content = io.read
36
+ io.close_read
37
+ end
38
+ content
39
+ end
40
+
41
+ protected
42
+
43
+ def self.html2text_script
44
+ @html2text ||= Pathname.new(File.dirname(__FILE__))
45
+ .join('vendor', 'html2text.py').to_s
46
+ end
47
+ end
48
+
49
+ # @class Habari2md::Exporter
50
+ # @example Export stuff
51
+ # worker = Habari2md::Exporter.new(db: 'foo', user: 'root')
52
+ # worker.export_posts("./out")
53
+ class Exporter
54
+ attr_reader :db
55
+ include Celluloid
56
+ include Celluloid::Logger
57
+
58
+ def initialize(opts = {})
59
+ @db = Sequel.connect(db_uri opts)
60
+ @counter = 0
61
+ @halfway = 0
62
+
63
+ # Cache users
64
+ @users = @db[:users].all.inject({}) do |cache, user|
65
+ cache.merge!([user[:id]] => user)
66
+ end
67
+ end
68
+
69
+ def posts
70
+ db[:posts].order(:modified)
71
+ end
72
+
73
+ # @return [Hash]
74
+ def user(id)
75
+ @users.fetch(id, {})
76
+ end
77
+
78
+ def export_posts(directory)
79
+ FileUtils.mkdir_p(directory) unless File.directory?(directory)
80
+
81
+ @counter = posts.count
82
+ @halfway = @counter / 2
83
+
84
+ info "Exporting #{@counter} posts..."
85
+
86
+ pool = Habari2md::PostExporter.pool(args: [directory, current_actor])
87
+ posts.each { |post| pool.async.export(post) }
88
+
89
+ wait(:done)
90
+ info "We're done."
91
+ end
92
+
93
+ # Called by PostExport when an export operation has finished.
94
+ def post_exported(post_id)
95
+ @counter -= 1
96
+ info "50% to go" if @counter == @halfway
97
+ signal(:done) if @counter == 0
98
+ end
99
+
100
+ protected
101
+
102
+ def db_uri(opts)
103
+ "mysql://#{opts[:user]}:#{opts[:password]}@#{opts[:host]}/#{opts[:db]}"
104
+ end
105
+ end
106
+
107
+ # @class Habari2md::PostExporter Export one post
108
+ class PostExporter
109
+ include Celluloid
110
+
111
+ # Output directory
112
+ attr_reader :dir
113
+
114
+ # Manager actor
115
+ attr_reader :manager
116
+
117
+ def initialize(dest_dir, manager_actor)
118
+ @dir = Pathname.new(dest_dir)
119
+ @manager = manager_actor
120
+ end
121
+
122
+ # Placeholder title for untitled posts
123
+ def untitled
124
+ "Untitled"
125
+ end
126
+
127
+ # Signal the managing actor when a post has been exported
128
+ def done(post = {})
129
+ manager.post_exported(post[:id])
130
+ end
131
+
132
+ # Export one post to disk
133
+ # @param [Hash] post
134
+ def export(post)
135
+ # Ignore deleted posts and drafts.
136
+ return done(post) unless published?(post)
137
+
138
+ author = manager.user(post[:user_id])[:username]
139
+ title = post[:title].gsub(/[\r\n]/, '')
140
+ title = untitled if title == ""
141
+ date = Time.strptime(post[:pubdate].to_s, "%s").strftime("%Y-%m-%d")
142
+ filename = dir.join("#{date}-#{post[:slug]}.md")
143
+ return done(post) if File.exists?(filename) && ENV['FORCE'] == nil
144
+
145
+ # Make sure content is at least formatted with <p> tags before
146
+ # conversion.
147
+ content = Habari2md::Text.simple_format(post[:content])
148
+ File.open(filename, 'w+') do |fh|
149
+ fh << "---\n"
150
+ fh << "title: #{title}\n"
151
+ fh << "author: #{author}\n" unless author == nil
152
+ fh << "---\n\n"
153
+ fh << Habari2md::Text.html2text(content)
154
+ end
155
+
156
+ done(post)
157
+ end
158
+
159
+ # This actually depends on the values in the poststatus table.
160
+ def published?(post)
161
+ post[:status] == 2
162
+ end
163
+ end
164
+ end
@@ -0,0 +1,3 @@
1
module Habari2md
  # Gem version, referenced by the gemspec. Frozen so the shared string
  # cannot be mutated by callers.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,914 @@
1
+ #!/usr/bin/env python
2
+ """html2text: Turn HTML into equivalent Markdown-structured text."""
3
+ __version__ = "3.200.3"
4
+ __author__ = "Aaron Swartz (me@aaronsw.com)"
5
+ __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6
+ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
7
+
8
+ # TODO:
9
+ # Support decoded entities with unifiable.
10
+
11
+ try:
12
+ True
13
+ except NameError:
14
+ setattr(__builtins__, 'True', 1)
15
+ setattr(__builtins__, 'False', 0)
16
+
17
def has_key(x, y):
    """Py2/Py3 membership shim: use dict.has_key when available, else `in`."""
    if hasattr(x, 'has_key'):
        return x.has_key(y)
    return y in x
20
+
21
+ try:
22
+ import htmlentitydefs
23
+ import urlparse
24
+ import HTMLParser
25
+ except ImportError: #Python3
26
+ import html.entities as htmlentitydefs
27
+ import urllib.parse as urlparse
28
+ import html.parser as HTMLParser
29
+ try: #Python3
30
+ import urllib.request as urllib
31
+ except:
32
+ import urllib
33
+ import optparse, re, sys, codecs, types
34
+
35
+ try: from textwrap import wrap
36
+ except: pass
37
+
38
+ # Use Unicode characters instead of their ASCII pseudo-replacements
39
+ UNICODE_SNOB = 0
40
+
41
+ # Escape all special characters. Output is less readable, but avoids corner case formatting issues.
42
+ ESCAPE_SNOB = 0
43
+
44
+ # Put the links after each paragraph instead of at the end.
45
+ LINKS_EACH_PARAGRAPH = 0
46
+
47
+ # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
48
+ BODY_WIDTH = 78
49
+
50
+ # Don't show internal links (href="#local-anchor") -- corresponding link targets
51
+ # won't be visible in the plain text file anyway.
52
+ SKIP_INTERNAL_LINKS = True
53
+
54
+ # Use inline, rather than reference, formatting for images and links
55
+ INLINE_LINKS = True
56
+
57
+ # Number of pixels Google indents nested lists
58
+ GOOGLE_LIST_INDENT = 36
59
+
60
+ IGNORE_ANCHORS = False
61
+ IGNORE_IMAGES = False
62
+ IGNORE_EMPHASIS = False
63
+
64
+ ### Entity Nonsense ###
65
+
66
def name2cp(k):
    """Map an HTML entity name (e.g. 'amp') to its Unicode code point."""
    if k == 'apos':
        return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3+
        return htmlentitydefs.name2codepoint[k]
    # Legacy fallback: entitydefs maps names to latin-1 bytes or "&#NNN;".
    value = htmlentitydefs.entitydefs[k]
    if value.startswith("&#") and value.endswith(";"):
        return int(value[2:-1])  # not representable in latin-1
    return ord(codecs.latin_1_decode(value)[0])
74
+
75
+ unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
76
+ 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
77
+ 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
78
+ 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
79
+ 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
80
+ 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
81
+ 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
82
+ 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
83
+ 'lrm':'', 'rlm':''}
84
+
85
+ unifiable_n = {}
86
+
87
+ for k in unifiable.keys():
88
+ unifiable_n[name2cp(k)] = unifiable[k]
89
+
90
+ ### End Entity Nonsense ###
91
+
92
def onlywhite(line):
    """Return a truthy value iff the line consists only of blanks.

    Fixes the original, which (a) compared characters with `is` — identity,
    not equality, which is unreliable for strings — and (b) tested ' ' twice
    where the second test was clearly meant to be a tab.

    Returns False on the first non-blank character; otherwise returns the
    line itself (falsy when empty), preserving the original's truthiness
    contract for callers.
    """
    for c in line:
        if c != ' ' and c != '\t':
            return False
    return line
98
+
99
def hn(tag):
    """Heading level for 'h1'..'h9' tags; 0 or None (both falsy) otherwise."""
    if len(tag) == 2 and tag[0] == 'h':
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
105
+
106
def dumb_property_dict(style):
    """Parse an inline CSS declaration string into an attribute dict."""
    declarations = [chunk.split(':', 1) for chunk in style.split(';') if ':' in chunk]
    return dict((name.strip(), value.strip()) for name, value in declarations)
109
+
110
+ def dumb_css_parser(data):
111
+ """returns a hash of css selectors, each of which contains a hash of css attributes"""
112
+ # remove @import sentences
113
+ data += ';'
114
+ importIndex = data.find('@import')
115
+ while importIndex != -1:
116
+ data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
117
+ importIndex = data.find('@import')
118
+
119
+ # parse the css. reverted from dictionary compehension in order to support older pythons
120
+ elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
121
+ try:
122
+ elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
123
+ except ValueError:
124
+ elements = {} # not that important
125
+
126
+ return elements
127
+
128
def element_style(attrs, style_def, parent_style):
    """Effective style of an element: parent style, overridden by its CSS
    classes, then by its inline style attribute."""
    style = parent_style.copy()

    if 'class' in attrs:
        for css_class in attrs['class'].split():
            style.update(style_def['.' + css_class])

    if 'style' in attrs:
        style.update(dumb_property_dict(attrs['style']))

    return style
139
+
140
def google_list_style(style):
    """Classify a Google Docs list as unordered ('ul') or ordered ('ol')."""
    if style.get('list-style-type') in ('disc', 'circle', 'square', 'none'):
        return 'ul'
    return 'ol'
147
+
148
def google_has_height(style):
    """True when the element's CSS explicitly defines a 'height'."""
    return 'height' in style
153
+
154
def google_text_emphasis(style):
    """List the emphasis modifiers of an element, in the fixed order
    text-decoration, font-style, font-weight."""
    ordered_keys = ('text-decoration', 'font-style', 'font-weight')
    return [style[key] for key in ordered_keys if key in style]
164
+
165
def google_fixed_width_font(style):
    """True when the CSS names one of the known fixed-width font families."""
    return style.get('font-family', '') in ('Courier New', 'Consolas')
173
+
174
def list_numbering_start(attrs):
    """Zero-based starting number of a list, taken from its 'start' attr."""
    if 'start' in attrs:
        return int(attrs['start']) - 1
    return 0
180
+
181
+ class HTML2Text(HTMLParser.HTMLParser):
182
+ def __init__(self, out=None, baseurl=''):
183
+ HTMLParser.HTMLParser.__init__(self)
184
+
185
+ # Config options
186
+ self.unicode_snob = UNICODE_SNOB
187
+ self.escape_snob = ESCAPE_SNOB
188
+ self.links_each_paragraph = LINKS_EACH_PARAGRAPH
189
+ self.body_width = BODY_WIDTH
190
+ self.skip_internal_links = SKIP_INTERNAL_LINKS
191
+ self.inline_links = INLINE_LINKS
192
+ self.google_list_indent = GOOGLE_LIST_INDENT
193
+ self.ignore_links = IGNORE_ANCHORS
194
+ self.ignore_images = IGNORE_IMAGES
195
+ self.ignore_emphasis = IGNORE_EMPHASIS
196
+ self.google_doc = False
197
+ self.ul_item_mark = '*'
198
+ self.emphasis_mark = '_'
199
+ self.strong_mark = '**'
200
+
201
+ if out is None:
202
+ self.out = self.outtextf
203
+ else:
204
+ self.out = out
205
+
206
+ self.outtextlist = [] # empty list to store output characters before they are "joined"
207
+
208
+ try:
209
+ self.outtext = unicode()
210
+ except NameError: # Python3
211
+ self.outtext = str()
212
+
213
+ self.quiet = 0
214
+ self.p_p = 0 # number of newline character to print before next output
215
+ self.outcount = 0
216
+ self.start = 1
217
+ self.space = 0
218
+ self.a = []
219
+ self.astack = []
220
+ self.maybe_automatic_link = None
221
+ self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
222
+ self.acount = 0
223
+ self.list = []
224
+ self.blockquote = 0
225
+ self.pre = 0
226
+ self.startpre = 0
227
+ self.code = False
228
+ self.br_toggle = ''
229
+ self.lastWasNL = 0
230
+ self.lastWasList = False
231
+ self.style = 0
232
+ self.style_def = {}
233
+ self.tag_stack = []
234
+ self.emphasis = 0
235
+ self.drop_white_space = 0
236
+ self.inheader = False
237
+ self.abbr_title = None # current abbreviation definition
238
+ self.abbr_data = None # last inner HTML (for abbr being defined)
239
+ self.abbr_list = {} # stack of abbreviations to write later
240
+ self.baseurl = baseurl
241
+
242
+ try: del unifiable_n[name2cp('nbsp')]
243
+ except KeyError: pass
244
+ unifiable['nbsp'] = '&nbsp_place_holder;'
245
+
246
+
247
+ def feed(self, data):
248
+ data = data.replace("</' + 'script>", "</ignore>")
249
+ HTMLParser.HTMLParser.feed(self, data)
250
+
251
+ def handle(self, data):
252
+ self.feed(data)
253
+ self.feed("")
254
+ return self.optwrap(self.close())
255
+
256
+ def outtextf(self, s):
257
+ self.outtextlist.append(s)
258
+ if s: self.lastWasNL = s[-1] == '\n'
259
+
260
+ def close(self):
261
+ HTMLParser.HTMLParser.close(self)
262
+
263
+ self.pbr()
264
+ self.o('', 0, 'end')
265
+
266
+ self.outtext = self.outtext.join(self.outtextlist)
267
+ if self.unicode_snob:
268
+ nbsp = unichr(name2cp('nbsp'))
269
+ else:
270
+ nbsp = u' '
271
+ self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
272
+
273
+ return self.outtext
274
+
275
+ def handle_charref(self, c):
276
+ self.o(self.charref(c), 1)
277
+
278
+ def handle_entityref(self, c):
279
+ self.o(self.entityref(c), 1)
280
+
281
+ def handle_starttag(self, tag, attrs):
282
+ self.handle_tag(tag, attrs, 1)
283
+
284
+ def handle_endtag(self, tag):
285
+ self.handle_tag(tag, None, 0)
286
+
287
+ def previousIndex(self, attrs):
288
+ """ returns the index of certain set of attributes (of a link) in the
289
+ self.a list
290
+
291
+ If the set of attributes is not found, returns None
292
+ """
293
+ if not has_key(attrs, 'href'): return None
294
+
295
+ i = -1
296
+ for a in self.a:
297
+ i += 1
298
+ match = 0
299
+
300
+ if has_key(a, 'href') and a['href'] == attrs['href']:
301
+ if has_key(a, 'title') or has_key(attrs, 'title'):
302
+ if (has_key(a, 'title') and has_key(attrs, 'title') and
303
+ a['title'] == attrs['title']):
304
+ match = True
305
+ else:
306
+ match = True
307
+
308
+ if match: return i
309
+
310
+ def drop_last(self, nLetters):
311
+ if not self.quiet:
312
+ self.outtext = self.outtext[:-nLetters]
313
+
314
+ def handle_emphasis(self, start, tag_style, parent_style):
315
+ """handles various text emphases"""
316
+ tag_emphasis = google_text_emphasis(tag_style)
317
+ parent_emphasis = google_text_emphasis(parent_style)
318
+
319
+ # handle Google's text emphasis
320
+ strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
321
+ bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
322
+ italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
323
+ fixed = google_fixed_width_font(tag_style) and not \
324
+ google_fixed_width_font(parent_style) and not self.pre
325
+
326
+ if start:
327
+ # crossed-out text must be handled before other attributes
328
+ # in order not to output qualifiers unnecessarily
329
+ if bold or italic or fixed:
330
+ self.emphasis += 1
331
+ if strikethrough:
332
+ self.quiet += 1
333
+ if italic:
334
+ self.o(self.emphasis_mark)
335
+ self.drop_white_space += 1
336
+ if bold:
337
+ self.o(self.strong_mark)
338
+ self.drop_white_space += 1
339
+ if fixed:
340
+ self.o('`')
341
+ self.drop_white_space += 1
342
+ self.code = True
343
+ else:
344
+ if bold or italic or fixed:
345
+ # there must not be whitespace before closing emphasis mark
346
+ self.emphasis -= 1
347
+ self.space = 0
348
+ self.outtext = self.outtext.rstrip()
349
+ if fixed:
350
+ if self.drop_white_space:
351
+ # empty emphasis, drop it
352
+ self.drop_last(1)
353
+ self.drop_white_space -= 1
354
+ else:
355
+ self.o('`')
356
+ self.code = False
357
+ if bold:
358
+ if self.drop_white_space:
359
+ # empty emphasis, drop it
360
+ self.drop_last(2)
361
+ self.drop_white_space -= 1
362
+ else:
363
+ self.o(self.strong_mark)
364
+ if italic:
365
+ if self.drop_white_space:
366
+ # empty emphasis, drop it
367
+ self.drop_last(1)
368
+ self.drop_white_space -= 1
369
+ else:
370
+ self.o(self.emphasis_mark)
371
+ # space is only allowed after *all* emphasis marks
372
+ if (bold or italic) and not self.emphasis:
373
+ self.o(" ")
374
+ if strikethrough:
375
+ self.quiet -= 1
376
+
377
+ def handle_tag(self, tag, attrs, start):
378
+ #attrs = fixattrs(attrs)
379
+ if attrs is None:
380
+ attrs = {}
381
+ else:
382
+ attrs = dict(attrs)
383
+
384
+ if self.google_doc:
385
+ # the attrs parameter is empty for a closing tag. in addition, we
386
+ # need the attributes of the parent nodes in order to get a
387
+ # complete style description for the current element. we assume
388
+ # that google docs export well formed html.
389
+ parent_style = {}
390
+ if start:
391
+ if self.tag_stack:
392
+ parent_style = self.tag_stack[-1][2]
393
+ tag_style = element_style(attrs, self.style_def, parent_style)
394
+ self.tag_stack.append((tag, attrs, tag_style))
395
+ else:
396
+ dummy, attrs, tag_style = self.tag_stack.pop()
397
+ if self.tag_stack:
398
+ parent_style = self.tag_stack[-1][2]
399
+
400
+ if hn(tag):
401
+ self.p()
402
+ if start:
403
+ self.inheader = True
404
+ self.o(hn(tag)*"#" + ' ')
405
+ else:
406
+ self.inheader = False
407
+ return # prevent redundant emphasis marks on headers
408
+
409
+ if tag in ['p', 'div']:
410
+ if self.google_doc:
411
+ if start and google_has_height(tag_style):
412
+ self.p()
413
+ else:
414
+ self.soft_br()
415
+ else:
416
+ self.p()
417
+
418
+ if tag == "br" and start: self.o(" \n")
419
+
420
+ if tag == "hr" and start:
421
+ self.p()
422
+ self.o("* * *")
423
+ self.p()
424
+
425
+ if tag in ["head", "style", 'script']:
426
+ if start: self.quiet += 1
427
+ else: self.quiet -= 1
428
+
429
+ if tag == "style":
430
+ if start: self.style += 1
431
+ else: self.style -= 1
432
+
433
+ if tag in ["body"]:
434
+ self.quiet = 0 # sites like 9rules.com never close <head>
435
+
436
+ if tag == "blockquote":
437
+ if start:
438
+ self.p(); self.o('> ', 0, 1); self.start = 1
439
+ self.blockquote += 1
440
+ else:
441
+ self.blockquote -= 1
442
+ self.p()
443
+
444
+ if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
445
+ if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
446
+ if tag in ['del', 'strike', 's']:
447
+ if start:
448
+ self.o("<"+tag+">")
449
+ else:
450
+ self.o("</"+tag+">")
451
+
452
+ if self.google_doc:
453
+ if not self.inheader:
454
+ # handle some font attributes, but leave headers clean
455
+ self.handle_emphasis(start, tag_style, parent_style)
456
+
457
+ if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
458
+ if tag == "abbr":
459
+ if start:
460
+ self.abbr_title = None
461
+ self.abbr_data = ''
462
+ if has_key(attrs, 'title'):
463
+ self.abbr_title = attrs['title']
464
+ else:
465
+ if self.abbr_title != None:
466
+ self.abbr_list[self.abbr_data] = self.abbr_title
467
+ self.abbr_title = None
468
+ self.abbr_data = ''
469
+
470
+ if tag == "a" and not self.ignore_links:
471
+ if start:
472
+ if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
473
+ self.astack.append(attrs)
474
+ self.maybe_automatic_link = attrs['href']
475
+ else:
476
+ self.astack.append(None)
477
+ else:
478
+ if self.astack:
479
+ a = self.astack.pop()
480
+ if self.maybe_automatic_link:
481
+ self.maybe_automatic_link = None
482
+ elif a:
483
+ if self.inline_links:
484
+ self.o("](" + escape_md(a['href']) + ")")
485
+ else:
486
+ i = self.previousIndex(a)
487
+ if i is not None:
488
+ a = self.a[i]
489
+ else:
490
+ self.acount += 1
491
+ a['count'] = self.acount
492
+ a['outcount'] = self.outcount
493
+ self.a.append(a)
494
+ self.o("][" + str(a['count']) + "]")
495
+
496
+ if tag == "img" and start and not self.ignore_images:
497
+ if has_key(attrs, 'src'):
498
+ attrs['href'] = attrs['src']
499
+ alt = attrs.get('alt', '')
500
+ self.o("![" + escape_md(alt) + "]")
501
+
502
+ if self.inline_links:
503
+ self.o("(" + escape_md(attrs['href']) + ")")
504
+ else:
505
+ i = self.previousIndex(attrs)
506
+ if i is not None:
507
+ attrs = self.a[i]
508
+ else:
509
+ self.acount += 1
510
+ attrs['count'] = self.acount
511
+ attrs['outcount'] = self.outcount
512
+ self.a.append(attrs)
513
+ self.o("[" + str(attrs['count']) + "]")
514
+
515
+ if tag == 'dl' and start: self.p()
516
+ if tag == 'dt' and not start: self.pbr()
517
+ if tag == 'dd' and start: self.o(' ')
518
+ if tag == 'dd' and not start: self.pbr()
519
+
520
+ if tag in ["ol", "ul"]:
521
+ # Google Docs create sub lists as top level lists
522
+ if (not self.list) and (not self.lastWasList):
523
+ self.p()
524
+ if start:
525
+ if self.google_doc:
526
+ list_style = google_list_style(tag_style)
527
+ else:
528
+ list_style = tag
529
+ numbering_start = list_numbering_start(attrs)
530
+ self.list.append({'name':list_style, 'num':numbering_start})
531
+ else:
532
+ if self.list: self.list.pop()
533
+ self.lastWasList = True
534
+ else:
535
+ self.lastWasList = False
536
+
537
+ if tag == 'li':
538
+ self.pbr()
539
+ if start:
540
+ if self.list: li = self.list[-1]
541
+ else: li = {'name':'ul', 'num':0}
542
+ if self.google_doc:
543
+ nest_count = self.google_nest_count(tag_style)
544
+ else:
545
+ nest_count = len(self.list)
546
+ self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
547
+ if li['name'] == "ul": self.o(self.ul_item_mark + " ")
548
+ elif li['name'] == "ol":
549
+ li['num'] += 1
550
+ self.o(str(li['num'])+". ")
551
+ self.start = 1
552
+
553
+ if tag in ["table", "tr"] and start: self.p()
554
+ if tag == 'td': self.pbr()
555
+
556
+ if tag == "pre":
557
+ if start:
558
+ self.startpre = 1
559
+ self.pre = 1
560
+ else:
561
+ self.pre = 0
562
+ self.p()
563
+
564
+ def pbr(self):
565
+ if self.p_p == 0:
566
+ self.p_p = 1
567
+
568
+ def p(self):
569
+ self.p_p = 2
570
+
571
+ def soft_br(self):
572
+ self.pbr()
573
+ self.br_toggle = ' '
574
+
575
+ def o(self, data, puredata=0, force=0):
576
+ if self.abbr_data is not None:
577
+ self.abbr_data += data
578
+
579
+ if not self.quiet:
580
+ if self.google_doc:
581
+ # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
582
+ lstripped_data = data.lstrip()
583
+ if self.drop_white_space and not (self.pre or self.code):
584
+ data = lstripped_data
585
+ if lstripped_data != '':
586
+ self.drop_white_space = 0
587
+
588
+ if puredata and not self.pre:
589
+ data = re.sub('\s+', ' ', data)
590
+ if data and data[0] == ' ':
591
+ self.space = 1
592
+ data = data[1:]
593
+ if not data and not force: return
594
+
595
+ if self.startpre:
596
+ #self.out(" :") #TODO: not output when already one there
597
+ if not data.startswith("\n"): # <pre>stuff...
598
+ data = "\n" + data
599
+
600
+ bq = (">" * self.blockquote)
601
+ if not (force and data and data[0] == ">") and self.blockquote: bq += " "
602
+
603
+ if self.pre:
604
+ if not self.list:
605
+ bq += " "
606
+ #else: list content is already partially indented
607
+ for i in xrange(len(self.list)):
608
+ bq += " "
609
+ data = data.replace("\n", "\n"+bq)
610
+
611
+ if self.startpre:
612
+ self.startpre = 0
613
+ if self.list:
614
+ data = data.lstrip("\n") # use existing initial indentation
615
+
616
+ if self.start:
617
+ self.space = 0
618
+ self.p_p = 0
619
+ self.start = 0
620
+
621
+ if force == 'end':
622
+ # It's the end.
623
+ self.p_p = 0
624
+ self.out("\n")
625
+ self.space = 0
626
+
627
+ if self.p_p:
628
+ self.out((self.br_toggle+'\n'+bq)*self.p_p)
629
+ self.space = 0
630
+ self.br_toggle = ''
631
+
632
+ if self.space:
633
+ if not self.lastWasNL: self.out(' ')
634
+ self.space = 0
635
+
636
+ if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
637
+ if force == "end": self.out("\n")
638
+
639
+ newa = []
640
+ for link in self.a:
641
+ if self.outcount > link['outcount']:
642
+ self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
643
+ if has_key(link, 'title'): self.out(" ("+link['title']+")")
644
+ self.out("\n")
645
+ else:
646
+ newa.append(link)
647
+
648
+ if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
649
+
650
+ self.a = newa
651
+
652
+ if self.abbr_list and force == "end":
653
+ for abbr, definition in self.abbr_list.items():
654
+ self.out(" *[" + abbr + "]: " + definition + "\n")
655
+
656
+ self.p_p = 0
657
+ self.out(data)
658
+ self.outcount += 1
659
+
660
    def handle_data(self, data):
        # Character-data callback from the HTML parser.
        # A literal "\/script>" appearing inside text re-enables output that
        # was suppressed while inside a <script> element.
        if r'\/script>' in data: self.quiet -= 1

        # Inside a <style> element: accumulate the CSS rules (used by the
        # Google Docs conversion mode).
        if self.style:
            self.style_def.update(dumb_css_parser(data))

        # Pending "automatic link" decision made when the <a> tag was seen:
        # if the anchor text equals its own absolute href, emit the compact
        # autolink form "<url>"; otherwise fall back to "[text](href)" by
        # emitting the opening bracket now.
        if not self.maybe_automatic_link is None:
            href = self.maybe_automatic_link
            if href == data and self.absolute_url_matcher.match(href):
                self.o("<" + data + ">")
                return
            else:
                self.o("[")
                self.maybe_automatic_link = None

        # Escape markdown-sensitive characters, except inside code/pre spans
        # where the text must pass through verbatim.
        if not self.code and not self.pre:
            data = escape_md_section(data, snob=self.escape_snob)
        self.o(data, 1)
679
+ def unknown_decl(self, data): pass
680
+
681
+ def charref(self, name):
682
+ if name[0] in ['x','X']:
683
+ c = int(name[1:], 16)
684
+ else:
685
+ c = int(name)
686
+
687
+ if not self.unicode_snob and c in unifiable_n.keys():
688
+ return unifiable_n[c]
689
+ else:
690
+ try:
691
+ return unichr(c)
692
+ except NameError: #Python3
693
+ return chr(c)
694
+
695
+ def entityref(self, c):
696
+ if not self.unicode_snob and c in unifiable.keys():
697
+ return unifiable[c]
698
+ else:
699
+ try: name2cp(c)
700
+ except KeyError: return "&" + c + ';'
701
+ else:
702
+ try:
703
+ return unichr(name2cp(c))
704
+ except NameError: #Python3
705
+ return chr(name2cp(c))
706
+
707
+ def replaceEntities(self, s):
708
+ s = s.group(1)
709
+ if s[0] == "#":
710
+ return self.charref(s[1:])
711
+ else: return self.entityref(s)
712
+
713
    # Matches one HTML entity — numeric ("&#65;", "&#x41;") or named
    # ("&amp;") — capturing the text between '&' and ';'.
    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    def unescape(self, s):
        # Expand every entity in s via replaceEntities (charref/entityref).
        return self.r_unescape.sub(self.replaceEntities, s)
717
+ def google_nest_count(self, style):
718
+ """calculate the nesting count of google doc lists"""
719
+ nest_count = 0
720
+ if 'margin-left' in style:
721
+ nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
722
+ return nest_count
723
+
724
+
725
    def optwrap(self, text):
        """Wrap all paragraphs in the provided text."""
        # body_width == 0 disables wrapping entirely.
        if not self.body_width:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0
        # Re-flow the text one physical line ("paragraph") at a time.
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para):
                    result += "\n".join(wrap(para, self.body_width))
                    if para.endswith(' '):
                        # Trailing whitespace marks a markdown hard line
                        # break; preserve it instead of a blank line.
                        # NOTE(review): markdown requires TWO trailing
                        # spaces; the source here shows one — possibly
                        # collapsed in transit. Verify against upstream.
                        result += " \n"
                        newlines = 1
                    else:
                        result += "\n\n"
                        newlines = 2
                else:
                    # Code blocks, lists, hr-like lines: pass through as-is.
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                # Collapse runs of blank lines to at most two newlines.
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result
753
# --- module-level patterns used by skipwrap() and the escape helpers ---

# "1. " style ordered-list marker at the start of a string.
ordered_list_matcher = re.compile(r'\d+\.\s')
# "- ", "* " or "+ " unordered-list marker.
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped inside other markdown constructs (see escape_md).
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Full metacharacter set, applied only with --escape-all (escape_snob).
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# A line-leading "number." that would otherwise render as an ordered list.
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# A line-leading "+" that would otherwise render as a bullet.
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
# A line-leading "-" that would otherwise render as a bullet, hr, or header.
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
# Characters that must be protected when preceded by a backslash.
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)
783
def skipwrap(para):
    """Return True when a paragraph must NOT be line-wrapped.

    Code blocks and list items keep their exact layout; everything else
    may be re-flowed by optwrap().
    """
    # Robustness: an empty paragraph has nothing to wrap (and indexing
    # para[0] below would raise IndexError). optwrap() happens to guard
    # this, but skipwrap() is module-level and callable on its own.
    if not para:
        return False
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    if para[0:4] == '    ' or para[0] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceeded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
803
def wrapwrite(text):
    """Write text to stdout as UTF-8 bytes (works on Python 2 and 3)."""
    encoded = text.encode('utf-8')
    try:
        # Python 3: the text-mode stdout wraps a binary buffer.
        sys.stdout.buffer.write(encoded)
    except AttributeError:
        # Python 2: stdout accepts byte strings directly.
        sys.stdout.write(encoded)
810
def html2text(html, baseurl=''):
    """One-shot convenience wrapper: convert an HTML string to markdown."""
    converter = HTML2Text(baseurl=baseurl)
    return converter.handle(html)
814
def unescape(s, unicode_snob=False):
    """Module-level helper: expand HTML entities in s to plain text.

    unicode_snob=True keeps real unicode characters instead of mapping
    them to ASCII lookalikes.
    """
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
819
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs."""
    # Backslash-escape \, [, ], ( and ) so they survive inside link text etc.
    return md_chars_matcher.sub(r"\\\1", text)
823
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections."""
    # Escape backslashes that precede an escapable character FIRST, so the
    # substitutions below don't double-escape what we insert.
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        # --escape-all mode: escape every markdown metacharacter.
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # A line-leading "1." would render as an ordered list; escape the dot.
    text = md_dot_matcher.sub(r"\1\\\2", text)
    # A line-leading "+ " would render as a bullet; escape the plus.
    text = md_plus_matcher.sub(r"\1\\\2", text)
    # A line-leading "-" would render as a bullet/hr/header; escape the dash.
    text = md_dash_matcher.sub(r"\1\\\2", text)
    return text
834
def main():
    # CLI entry point: read HTML from a file, URL, or stdin and write
    # markdown to stdout. Options mirror the HTML2Text attributes.
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): because encoding is pre-set to "utf-8" here, the two
    # "if encoding is None" auto-detection branches below (feedparser /
    # chardet) are unreachable; upstream html2text starts with
    # encoding = None to enable them. Left as-is to preserve behavior.
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            # Optional second positional argument overrides the encoding.
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            # NOTE(review): urllib.urlopen is Python 2 only; Python 3 would
            # need urllib.request.urlopen — confirm intended interpreter.
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # No positional arguments: read the HTML from stdin.
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
913
if __name__ == "__main__":
    # Run the converter only when executed as a script, not on import.
    main()