habari2md 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c2fb0d61a3de12d336d24ab9b5d3e47ec7b9a098
4
+ data.tar.gz: 063bedd9049364662e8e200efbf4a2a1c97a9f46
5
+ SHA512:
6
+ metadata.gz: 17550bf8808495e447d04fe9f5475e170a28f8119319b89a5099aad97da5569b1d7a95c4cb3e7961c742536ca5851a73939053f25a1ef85c3ca255b6597fe907
7
+ data.tar.gz: 5c3a0c209e76eceb05092ed68e9fa1ab555f2bb63172357f70d068657f1e6094caa56e637916452f30fd9a0bdaa726708034cc45f6fb20a3d44024b292fa1ecb
@@ -0,0 +1,2 @@
1
+ .����-��o�W�Q����<�C�^d3��_�A�垌��{)YT�o.᎝���B�m|��y���ĂDƠ���/�����2%�*��Q��O�nY�+��\!b����ղ
2
+ j'��(s
Binary file
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in habari2md.gemspec
4
+ gemspec
@@ -0,0 +1,14 @@
1
+ Copyright (c) 2014 Arnaud Berthomier
2
+
3
+ habari2md is free software: you can redistribute it and/or modify it
4
+ under the terms of the GNU General Public License as published by the
5
+ Free Software Foundation, either version 3 of the License, or (at your
6
+ option) any later version.
7
+
8
+ habari2md is distributed in the hope that it will be useful, but WITHOUT
9
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11
+ for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -0,0 +1,64 @@
1
+ # Habari2md
2
+
3
+ This is a dirty little Ruby program to export a [Habari][habari] blog to
4
+ markdown format. I used it to avoid installing PHP on a small VPS in order to
5
+ run a tiny blog of ~2000 posts.
6
+
7
+ The program makes a few assumptions about your setup, and this determines what
8
+ you should expect to get from it.
9
+
10
+ * It will connect to a MariaDB/MySQL database,
11
+ * fetch all of its posts and:
12
+ * dump one file per published post in the `out` directory ;
13
+ * use a filename like `YYYY-MM-DD-post-slug.md` where `YYYY-MM-DD` are the
14
+ year, month, and month day when a particular post was published ;
15
+ * and format a post header with:
16
+
17
+ ```
18
+ title: The original post title
19
+ author: The author's username
20
+ ```
21
+
22
+ This process can be pretty specific, and if it does not fit your setup, feel
23
+ free to file an issue or, better, send a pull-request. ;)
24
+
25
+ # Dependencies
26
+
27
+ * Ruby >= 1.9
28
+ * Python >= 2.x
29
+
30
+ # Installation
31
+
32
+ `gem install habari2md`
33
+
34
+ # Usage
35
+
36
+ ```
37
+ $ habari2md -h
38
+ Usage: habari2md [options]
39
+ -o, --output [DIR] Output directory
40
+ -s, --host [HOST] Database host
41
+ -d, --db [DB] Database name
42
+ -u, --user [USER] Database user
43
+ -p, --password [PASS] Database password
44
+ -h, --help Show this message
45
+ $ habari2md -o foobar -d my_blog_database -h localhost -u sql_user -p sql_password
46
+ I, [2014-01-08T23:31:20.771303 #74090] INFO -- : Exporting 12345 posts...
47
+ I, [2014-01-08T23:31:50.618731 #74090] INFO -- : 50% to go
48
+ I, [2014-01-08T23:32:20.081583 #74090] INFO -- : We're done.
49
+ D, [2014-01-08T23:32:20.083582 #74090] DEBUG -- : Terminating 6 actors...
50
+ W, [2014-01-08T23:32:20.084398 #74090] WARN -- : Terminating task: type=:finalizer, meta={:method_name=>:__shutdown__}, status=:callwait
51
+
52
+ ```
53
+
54
+ # License
55
+
56
+ GPL 3.0
57
+
58
+ Note: this distribution contains Aaron Swartz's GPL-licensed
59
+ [html2text][html2text] program. As a matter of fact, we fork one process to
60
+ convert each post from HTML to [Markdown][markdown], yay!
61
+
62
+ [habari]: http://habariproject.org/
63
+ [html2text]: http://www.aaronsw.com/2002/html2text/
64
+ [markdown]: http://daringfireball.net/projects/markdown/
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,44 @@
1
#!/usr/bin/env ruby

# Command-line entry point: parse connection/output options and run the
# exporter.
#
# BUG FIX: the original prepended `File.dirname(__FILE__) + '../lib'` to
# the load path — missing a path separator, which yields e.g. "bin../lib"
# and never resolves. Use expand_path relative to this script instead.
$:.unshift(File.expand_path('../../lib', __FILE__))

require 'optparse'
require 'habari2md'

# Defaults, overridable from the command line below.
options = {
  out: './out',
  host: 'localhost',
  user: 'root',
  password: 'root',
}

OptionParser.new do |opts|
  opts.banner = 'Usage: habari2md [options]'

  opts.on("-o", "--output [DIR]", "Output directory") do |dir|
    options[:out] = dir
  end

  opts.on("-s", "--host [HOST]", "Database host") do |host|
    options[:host] = host
  end

  opts.on("-d", "--db [DB]", "Database name") do |name|
    options[:db] = name
  end

  opts.on("-u", "--user [USER]", "Database user") do |user|
    options[:user] = user
  end

  opts.on("-p", "--password [PASS]", "Database password") do |pass|
    options[:password] = pass
  end

  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

Habari2md::Exporter.new(options).export_posts(options[:out])
@@ -0,0 +1,20 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIDXDCCAkSgAwIBAgIBATANBgkqhkiG9w0BAQUFADA6MQswCQYDVQQDDAJvejEW
3
+ MBQGCgmSJomT8ixkARkWBmN5cHJpbzETMBEGCgmSJomT8ixkARkWA25ldDAeFw0x
4
+ NDAxMDkxNzE0NDVaFw0xNTAxMDkxNzE0NDVaMDoxCzAJBgNVBAMMAm96MRYwFAYK
5
+ CZImiZPyLGQBGRYGY3lwcmlvMRMwEQYKCZImiZPyLGQBGRYDbmV0MIIBIjANBgkq
6
+ hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy10tbHJlv/nomAnN23gT/9WF0Sfr/6/L
7
+ o8rkkmtFgI4gZKpY3RmmhJavlzw7Pq3hT50AN+gpacyS6GJ6NRhyR59T7EK0Mar0
8
+ 7vCJhwW8EqjCjI2LVlv5NgJsQE9aFaNvNAl8cMuuWSw3UArB2ZRKsdE1J4KBTBpw
9
+ 7agSPppFarNuHKyAXXsg2rfBmkDvfUKXE+8BccQ3ga1guhfFTAQgk8zLjE21opti
10
+ 7qZbWToBSsV6dzBxpIWVkIcX2HnXsrpE1IJbXBzy60L5kHchzn+o2BB7wemBSMvk
11
+ yOaC2KRI5Xiy/THIZhheKGAHMvbu7xbz3Wt12J+H5iRBmE+VV/IRvwIDAQABo20w
12
+ azAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUP61Rx/1umQ17mKwZ
13
+ nGNam5fTDbMwGAYDVR0RBBEwD4ENb3pAY3lwcmlvLm5ldDAYBgNVHRIEETAPgQ1v
14
+ ekBjeXByaW8ubmV0MA0GCSqGSIb3DQEBBQUAA4IBAQC0CN++vyu1zcmOyckEHCa1
15
+ sk579L0i2DRO7bma9t+Z8Th6WVQqXGen0YYxznSzCPqQHN650IItnDUkujVHMI/g
16
+ ctUmyPXUryOA6EqFi0l+t7QSRysxy/79rZCIRufhFbsNhbwWMwUAEmHmJ2BHO7g4
17
+ EEI8FdoHY2xWEZ1SBu0gzn0Kmi5u1I6/i3NmvKchmIK3eQcPtu0xwSuFEw7SINcu
18
+ hfXfqFqS3mCcykIEz+V7ZRcIaiQse+263YcyYSYRws3EvEQH7C7XnUF7/Y6TpwnI
19
+ QDKpCyE1PBhKqihfimirfnkLKw1ZaUY9Nd8UpOopW8pA3eUdUqo0yJe6IQ7s8LyR
20
+ -----END CERTIFICATE-----
@@ -0,0 +1,29 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'habari2md/version'

Gem::Specification.new do |spec|
  spec.name          = "habari2md"
  spec.version       = Habari2md::VERSION
  spec.authors       = ["Arnaud Berthomier"]
  spec.email         = ["oz@cyprio.net"]
  spec.summary       = %q{Habari to markdown}
  spec.description   = %q{Dump a Habari blog posts to Markdown format}
  spec.homepage      = "https://github.com/oz/habari2md"
  # Use an SPDX identifier: "GPL v3" is not a recognized license id and
  # makes RubyGems warn at build time.
  spec.license       = "GPL-3.0"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = []
  spec.require_paths = ["lib"]
  spec.cert_chain    = ["certs/oz.pem"]
  # Only load the private signing key when actually building with `gem`.
  spec.signing_key   = File.expand_path("~/.ssh/gem-private_key.pem") if $0 =~ /gem\z/

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"

  spec.add_dependency "celluloid", "~> 0.15"
  spec.add_dependency "sequel", "~> 4.5"
  spec.add_dependency "mysql", "~> 2.9"
end
@@ -0,0 +1,164 @@
1
#!/usr/bin/env ruby
require 'fileutils'
require 'pathname'
require 'time'

require 'celluloid'
require 'sequel'
7
+
8
+ module Habari2md
9
# Text helpers: paragraph formatting and HTML -> Markdown conversion.
class Text
  # Wrap plain text in <p> tags, turning blank-line gaps into paragraph
  # breaks and single newlines into <br />.
  # Shameless snatch from Rails' simple_format.
  #
  # @param [String, nil] text
  # @return [String] HTML-formatted text (never nil)
  def self.simple_format(text)
    text = (text || '').dup.to_str
    start_tag = '<p>'
    text.gsub!(/\r\n?/, "\n")                    # \r\n and \r -> \n
    text.gsub!(/\n\n+/, "</p>\n\n#{start_tag}")  # 2+ newlines -> paragraph
    text.gsub!(/([^\n]\n)(?=[^\n])/, '\1<br />') # 1 newline -> br
    text.insert(0, start_tag)
    text << '</p>'
  end

  # Fork (!) html2text.py to convert from HTML to Markdown.
  #
  # @param [String] content HTML content
  # @return [String] Markdown content
  def self.html2text(content)
    IO.popen(html2text_script, "r+") do |io|
      io.write content
      io.close_write
      content = io.read
      io.close_read
    end
    content
  end

  # Absolute path to the vendored html2text.py script, memoized.
  # NOTE: the original declared `protected` above this method, but
  # `protected` has no effect on `def self.` singleton methods, so this
  # was always callable; the no-op marker is dropped.
  def self.html2text_script
    @html2text ||= Pathname.new(File.dirname(__FILE__))
                           .join('vendor', 'html2text.py').to_s
  end
end
48
+
49
# Connects to the Habari database and coordinates exporting every
# published post to a Markdown file via a pool of PostExporter actors.
#
# @example Export stuff
#   worker = Habari2md::Exporter.new(db: 'foo', user: 'root')
#   worker.export_posts("./out")
class Exporter
  attr_reader :db
  include Celluloid
  include Celluloid::Logger

  # @param [Hash] opts connection options (:user, :password, :host, :db)
  def initialize(opts = {})
    @db = Sequel.connect(db_uri(opts))
    @counter = 0
    @halfway = 0

    # Cache users keyed by id.
    # BUG FIX: the original merged `[user[:id]] => user`, keying the cache
    # by one-element *arrays*, so #user(id) never found anyone and every
    # exported post silently lost its author.
    @users = @db[:users].all.inject({}) do |cache, user|
      cache.merge!(user[:id] => user)
    end
  end

  # All posts, ordered by modification time.
  def posts
    db[:posts].order(:modified)
  end

  # Look up a cached user row by id.
  # @return [Hash] empty hash when the user is unknown
  def user(id)
    @users.fetch(id, {})
  end

  # Export every post to `directory`, one Markdown file per post.
  def export_posts(directory)
    FileUtils.mkdir_p(directory) unless File.directory?(directory)

    @counter = posts.count
    @halfway = @counter / 2

    # BUG FIX: with zero posts, :done is never signalled (post_exported
    # is the only signaller), so wait(:done) would block forever.
    if @counter.zero?
      info "No posts to export."
      return
    end

    info "Exporting #{@counter} posts..."

    pool = Habari2md::PostExporter.pool(args: [directory, current_actor])
    posts.each { |post| pool.async.export(post) }

    wait(:done)
    info "We're done."
  end

  # Called by PostExporter when an export operation has finished.
  def post_exported(post_id)
    @counter -= 1
    info "50% to go" if @counter == @halfway
    signal(:done) if @counter == 0
  end

  protected

  # Build a Sequel connection URI from the options hash.
  def db_uri(opts)
    "mysql://#{opts[:user]}:#{opts[:password]}@#{opts[:host]}/#{opts[:db]}"
  end
end
106
+
107
# Exports a single post to a Markdown file on disk.
class PostExporter
  include Celluloid

  # Output directory
  attr_reader :dir

  # Manager actor (a Habari2md::Exporter) notified on completion
  attr_reader :manager

  # @param [String] dest_dir output directory
  # @param [Habari2md::Exporter] manager_actor actor to notify per post
  def initialize(dest_dir, manager_actor)
    @dir = Pathname.new(dest_dir)
    @manager = manager_actor
  end

  # Placeholder title for untitled posts
  def untitled
    "Untitled"
  end

  # Signal the managing actor when a post has been exported
  def done(post = {})
    manager.post_exported(post[:id])
  end

  # Export one post to disk as `YYYY-MM-DD-slug.md`. Existing files are
  # kept unless the FORCE environment variable is set.
  #
  # @param [Hash] post a row from the posts table
  def export(post)
    # Ignore deleted posts and drafts.
    return done(post) unless published?(post)

    author = manager.user(post[:user_id])[:username]
    title = post[:title].gsub(/[\r\n]/, '')
    title = untitled if title.empty?
    # :pubdate is a Unix timestamp ("%s"); requires the 'time' stdlib.
    date = Time.strptime(post[:pubdate].to_s, "%s").strftime("%Y-%m-%d")
    filename = dir.join("#{date}-#{post[:slug]}.md")
    # File.exists? is deprecated (removed in Ruby 3.2) -> File.exist?
    return done(post) if File.exist?(filename) && ENV['FORCE'].nil?

    # Make sure content is at least formatted with <p> tags before
    # conversion.
    content = Habari2md::Text.simple_format(post[:content])
    File.open(filename, 'w+') do |fh|
      fh << "---\n"
      fh << "title: #{title}\n"
      fh << "author: #{author}\n" unless author.nil?
      fh << "---\n\n"
      fh << Habari2md::Text.html2text(content)
    end

    done(post)
  end

  # This actually depends on the values in the poststatus table.
  def published?(post)
    post[:status] == 2
  end
end
164
+ end
@@ -0,0 +1,3 @@
1
module Habari2md
  # Gem version string, referenced by the gemspec.
  VERSION = "0.0.1"
end
@@ -0,0 +1,914 @@
1
+ #!/usr/bin/env python
2
+ """html2text: Turn HTML into equivalent Markdown-structured text."""
3
+ __version__ = "3.200.3"
4
+ __author__ = "Aaron Swartz (me@aaronsw.com)"
5
+ __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6
+ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
7
+
8
+ # TODO:
9
+ # Support decoded entities with unifiable.
10
+
11
+ try:
12
+ True
13
+ except NameError:
14
+ setattr(__builtins__, 'True', 1)
15
+ setattr(__builtins__, 'False', 0)
16
+
17
+ def has_key(x, y):
18
+ if hasattr(x, 'has_key'): return x.has_key(y)
19
+ else: return y in x
20
+
21
+ try:
22
+ import htmlentitydefs
23
+ import urlparse
24
+ import HTMLParser
25
+ except ImportError: #Python3
26
+ import html.entities as htmlentitydefs
27
+ import urllib.parse as urlparse
28
+ import html.parser as HTMLParser
29
+ try: #Python3
30
+ import urllib.request as urllib
31
+ except:
32
+ import urllib
33
+ import optparse, re, sys, codecs, types
34
+
35
+ try: from textwrap import wrap
36
+ except: pass
37
+
38
+ # Use Unicode characters instead of their ascii psuedo-replacements
39
+ UNICODE_SNOB = 0
40
+
41
+ # Escape all special characters. Output is less readable, but avoids corner case formatting issues.
42
+ ESCAPE_SNOB = 0
43
+
44
+ # Put the links after each paragraph instead of at the end.
45
+ LINKS_EACH_PARAGRAPH = 0
46
+
47
+ # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
48
+ BODY_WIDTH = 78
49
+
50
+ # Don't show internal links (href="#local-anchor") -- corresponding link targets
51
+ # won't be visible in the plain text file anyway.
52
+ SKIP_INTERNAL_LINKS = True
53
+
54
+ # Use inline, rather than reference, formatting for images and links
55
+ INLINE_LINKS = True
56
+
57
+ # Number of pixels Google indents nested lists
58
+ GOOGLE_LIST_INDENT = 36
59
+
60
+ IGNORE_ANCHORS = False
61
+ IGNORE_IMAGES = False
62
+ IGNORE_EMPHASIS = False
63
+
64
+ ### Entity Nonsense ###
65
+
66
+ def name2cp(k):
67
+ if k == 'apos': return ord("'")
68
+ if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
69
+ return htmlentitydefs.name2codepoint[k]
70
+ else:
71
+ k = htmlentitydefs.entitydefs[k]
72
+ if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
73
+ return ord(codecs.latin_1_decode(k)[0])
74
+
75
+ unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
76
+ 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
77
+ 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
78
+ 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
79
+ 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
80
+ 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
81
+ 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
82
+ 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
83
+ 'lrm':'', 'rlm':''}
84
+
85
+ unifiable_n = {}
86
+
87
+ for k in unifiable.keys():
88
+ unifiable_n[name2cp(k)] = unifiable[k]
89
+
90
+ ### End Entity Nonsense ###
91
+
92
+ def onlywhite(line):
93
+ """Return true if the line does only consist of whitespace characters."""
94
+ for c in line:
95
+ if c is not ' ' and c is not ' ':
96
+ return c is ' '
97
+ return line
98
+
99
+ def hn(tag):
100
+ if tag[0] == 'h' and len(tag) == 2:
101
+ try:
102
+ n = int(tag[1])
103
+ if n in range(1, 10): return n
104
+ except ValueError: return 0
105
+
106
+ def dumb_property_dict(style):
107
+ """returns a hash of css attributes"""
108
+ return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
109
+
110
+ def dumb_css_parser(data):
111
+ """returns a hash of css selectors, each of which contains a hash of css attributes"""
112
+ # remove @import sentences
113
+ data += ';'
114
+ importIndex = data.find('@import')
115
+ while importIndex != -1:
116
+ data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
117
+ importIndex = data.find('@import')
118
+
119
+ # parse the css. reverted from dictionary compehension in order to support older pythons
120
+ elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
121
+ try:
122
+ elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
123
+ except ValueError:
124
+ elements = {} # not that important
125
+
126
+ return elements
127
+
128
+ def element_style(attrs, style_def, parent_style):
129
+ """returns a hash of the 'final' style attributes of the element"""
130
+ style = parent_style.copy()
131
+ if 'class' in attrs:
132
+ for css_class in attrs['class'].split():
133
+ css_style = style_def['.' + css_class]
134
+ style.update(css_style)
135
+ if 'style' in attrs:
136
+ immediate_style = dumb_property_dict(attrs['style'])
137
+ style.update(immediate_style)
138
+ return style
139
+
140
+ def google_list_style(style):
141
+ """finds out whether this is an ordered or unordered list"""
142
+ if 'list-style-type' in style:
143
+ list_style = style['list-style-type']
144
+ if list_style in ['disc', 'circle', 'square', 'none']:
145
+ return 'ul'
146
+ return 'ol'
147
+
148
+ def google_has_height(style):
149
+ """check if the style of the element has the 'height' attribute explicitly defined"""
150
+ if 'height' in style:
151
+ return True
152
+ return False
153
+
154
+ def google_text_emphasis(style):
155
+ """return a list of all emphasis modifiers of the element"""
156
+ emphasis = []
157
+ if 'text-decoration' in style:
158
+ emphasis.append(style['text-decoration'])
159
+ if 'font-style' in style:
160
+ emphasis.append(style['font-style'])
161
+ if 'font-weight' in style:
162
+ emphasis.append(style['font-weight'])
163
+ return emphasis
164
+
165
+ def google_fixed_width_font(style):
166
+ """check if the css of the current element defines a fixed width font"""
167
+ font_family = ''
168
+ if 'font-family' in style:
169
+ font_family = style['font-family']
170
+ if 'Courier New' == font_family or 'Consolas' == font_family:
171
+ return True
172
+ return False
173
+
174
+ def list_numbering_start(attrs):
175
+ """extract numbering from list element attributes"""
176
+ if 'start' in attrs:
177
+ return int(attrs['start']) - 1
178
+ else:
179
+ return 0
180
+
181
+ class HTML2Text(HTMLParser.HTMLParser):
182
+ def __init__(self, out=None, baseurl=''):
183
+ HTMLParser.HTMLParser.__init__(self)
184
+
185
+ # Config options
186
+ self.unicode_snob = UNICODE_SNOB
187
+ self.escape_snob = ESCAPE_SNOB
188
+ self.links_each_paragraph = LINKS_EACH_PARAGRAPH
189
+ self.body_width = BODY_WIDTH
190
+ self.skip_internal_links = SKIP_INTERNAL_LINKS
191
+ self.inline_links = INLINE_LINKS
192
+ self.google_list_indent = GOOGLE_LIST_INDENT
193
+ self.ignore_links = IGNORE_ANCHORS
194
+ self.ignore_images = IGNORE_IMAGES
195
+ self.ignore_emphasis = IGNORE_EMPHASIS
196
+ self.google_doc = False
197
+ self.ul_item_mark = '*'
198
+ self.emphasis_mark = '_'
199
+ self.strong_mark = '**'
200
+
201
+ if out is None:
202
+ self.out = self.outtextf
203
+ else:
204
+ self.out = out
205
+
206
+ self.outtextlist = [] # empty list to store output characters before they are "joined"
207
+
208
+ try:
209
+ self.outtext = unicode()
210
+ except NameError: # Python3
211
+ self.outtext = str()
212
+
213
+ self.quiet = 0
214
+ self.p_p = 0 # number of newline character to print before next output
215
+ self.outcount = 0
216
+ self.start = 1
217
+ self.space = 0
218
+ self.a = []
219
+ self.astack = []
220
+ self.maybe_automatic_link = None
221
+ self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
222
+ self.acount = 0
223
+ self.list = []
224
+ self.blockquote = 0
225
+ self.pre = 0
226
+ self.startpre = 0
227
+ self.code = False
228
+ self.br_toggle = ''
229
+ self.lastWasNL = 0
230
+ self.lastWasList = False
231
+ self.style = 0
232
+ self.style_def = {}
233
+ self.tag_stack = []
234
+ self.emphasis = 0
235
+ self.drop_white_space = 0
236
+ self.inheader = False
237
+ self.abbr_title = None # current abbreviation definition
238
+ self.abbr_data = None # last inner HTML (for abbr being defined)
239
+ self.abbr_list = {} # stack of abbreviations to write later
240
+ self.baseurl = baseurl
241
+
242
+ try: del unifiable_n[name2cp('nbsp')]
243
+ except KeyError: pass
244
+ unifiable['nbsp'] = '&nbsp_place_holder;'
245
+
246
+
247
+ def feed(self, data):
248
+ data = data.replace("</' + 'script>", "</ignore>")
249
+ HTMLParser.HTMLParser.feed(self, data)
250
+
251
+ def handle(self, data):
252
+ self.feed(data)
253
+ self.feed("")
254
+ return self.optwrap(self.close())
255
+
256
+ def outtextf(self, s):
257
+ self.outtextlist.append(s)
258
+ if s: self.lastWasNL = s[-1] == '\n'
259
+
260
+ def close(self):
261
+ HTMLParser.HTMLParser.close(self)
262
+
263
+ self.pbr()
264
+ self.o('', 0, 'end')
265
+
266
+ self.outtext = self.outtext.join(self.outtextlist)
267
+ if self.unicode_snob:
268
+ nbsp = unichr(name2cp('nbsp'))
269
+ else:
270
+ nbsp = u' '
271
+ self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
272
+
273
+ return self.outtext
274
+
275
+ def handle_charref(self, c):
276
+ self.o(self.charref(c), 1)
277
+
278
+ def handle_entityref(self, c):
279
+ self.o(self.entityref(c), 1)
280
+
281
+ def handle_starttag(self, tag, attrs):
282
+ self.handle_tag(tag, attrs, 1)
283
+
284
+ def handle_endtag(self, tag):
285
+ self.handle_tag(tag, None, 0)
286
+
287
+ def previousIndex(self, attrs):
288
+ """ returns the index of certain set of attributes (of a link) in the
289
+ self.a list
290
+
291
+ If the set of attributes is not found, returns None
292
+ """
293
+ if not has_key(attrs, 'href'): return None
294
+
295
+ i = -1
296
+ for a in self.a:
297
+ i += 1
298
+ match = 0
299
+
300
+ if has_key(a, 'href') and a['href'] == attrs['href']:
301
+ if has_key(a, 'title') or has_key(attrs, 'title'):
302
+ if (has_key(a, 'title') and has_key(attrs, 'title') and
303
+ a['title'] == attrs['title']):
304
+ match = True
305
+ else:
306
+ match = True
307
+
308
+ if match: return i
309
+
310
+ def drop_last(self, nLetters):
311
+ if not self.quiet:
312
+ self.outtext = self.outtext[:-nLetters]
313
+
314
+ def handle_emphasis(self, start, tag_style, parent_style):
315
+ """handles various text emphases"""
316
+ tag_emphasis = google_text_emphasis(tag_style)
317
+ parent_emphasis = google_text_emphasis(parent_style)
318
+
319
+ # handle Google's text emphasis
320
+ strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
321
+ bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
322
+ italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
323
+ fixed = google_fixed_width_font(tag_style) and not \
324
+ google_fixed_width_font(parent_style) and not self.pre
325
+
326
+ if start:
327
+ # crossed-out text must be handled before other attributes
328
+ # in order not to output qualifiers unnecessarily
329
+ if bold or italic or fixed:
330
+ self.emphasis += 1
331
+ if strikethrough:
332
+ self.quiet += 1
333
+ if italic:
334
+ self.o(self.emphasis_mark)
335
+ self.drop_white_space += 1
336
+ if bold:
337
+ self.o(self.strong_mark)
338
+ self.drop_white_space += 1
339
+ if fixed:
340
+ self.o('`')
341
+ self.drop_white_space += 1
342
+ self.code = True
343
+ else:
344
+ if bold or italic or fixed:
345
+ # there must not be whitespace before closing emphasis mark
346
+ self.emphasis -= 1
347
+ self.space = 0
348
+ self.outtext = self.outtext.rstrip()
349
+ if fixed:
350
+ if self.drop_white_space:
351
+ # empty emphasis, drop it
352
+ self.drop_last(1)
353
+ self.drop_white_space -= 1
354
+ else:
355
+ self.o('`')
356
+ self.code = False
357
+ if bold:
358
+ if self.drop_white_space:
359
+ # empty emphasis, drop it
360
+ self.drop_last(2)
361
+ self.drop_white_space -= 1
362
+ else:
363
+ self.o(self.strong_mark)
364
+ if italic:
365
+ if self.drop_white_space:
366
+ # empty emphasis, drop it
367
+ self.drop_last(1)
368
+ self.drop_white_space -= 1
369
+ else:
370
+ self.o(self.emphasis_mark)
371
+ # space is only allowed after *all* emphasis marks
372
+ if (bold or italic) and not self.emphasis:
373
+ self.o(" ")
374
+ if strikethrough:
375
+ self.quiet -= 1
376
+
377
+ def handle_tag(self, tag, attrs, start):
378
+ #attrs = fixattrs(attrs)
379
+ if attrs is None:
380
+ attrs = {}
381
+ else:
382
+ attrs = dict(attrs)
383
+
384
+ if self.google_doc:
385
+ # the attrs parameter is empty for a closing tag. in addition, we
386
+ # need the attributes of the parent nodes in order to get a
387
+ # complete style description for the current element. we assume
388
+ # that google docs export well formed html.
389
+ parent_style = {}
390
+ if start:
391
+ if self.tag_stack:
392
+ parent_style = self.tag_stack[-1][2]
393
+ tag_style = element_style(attrs, self.style_def, parent_style)
394
+ self.tag_stack.append((tag, attrs, tag_style))
395
+ else:
396
+ dummy, attrs, tag_style = self.tag_stack.pop()
397
+ if self.tag_stack:
398
+ parent_style = self.tag_stack[-1][2]
399
+
400
+ if hn(tag):
401
+ self.p()
402
+ if start:
403
+ self.inheader = True
404
+ self.o(hn(tag)*"#" + ' ')
405
+ else:
406
+ self.inheader = False
407
+ return # prevent redundant emphasis marks on headers
408
+
409
+ if tag in ['p', 'div']:
410
+ if self.google_doc:
411
+ if start and google_has_height(tag_style):
412
+ self.p()
413
+ else:
414
+ self.soft_br()
415
+ else:
416
+ self.p()
417
+
418
+ if tag == "br" and start: self.o(" \n")
419
+
420
+ if tag == "hr" and start:
421
+ self.p()
422
+ self.o("* * *")
423
+ self.p()
424
+
425
+ if tag in ["head", "style", 'script']:
426
+ if start: self.quiet += 1
427
+ else: self.quiet -= 1
428
+
429
+ if tag == "style":
430
+ if start: self.style += 1
431
+ else: self.style -= 1
432
+
433
+ if tag in ["body"]:
434
+ self.quiet = 0 # sites like 9rules.com never close <head>
435
+
436
+ if tag == "blockquote":
437
+ if start:
438
+ self.p(); self.o('> ', 0, 1); self.start = 1
439
+ self.blockquote += 1
440
+ else:
441
+ self.blockquote -= 1
442
+ self.p()
443
+
444
+ if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
445
+ if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
446
+ if tag in ['del', 'strike', 's']:
447
+ if start:
448
+ self.o("<"+tag+">")
449
+ else:
450
+ self.o("</"+tag+">")
451
+
452
+ if self.google_doc:
453
+ if not self.inheader:
454
+ # handle some font attributes, but leave headers clean
455
+ self.handle_emphasis(start, tag_style, parent_style)
456
+
457
+ if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
458
+ if tag == "abbr":
459
+ if start:
460
+ self.abbr_title = None
461
+ self.abbr_data = ''
462
+ if has_key(attrs, 'title'):
463
+ self.abbr_title = attrs['title']
464
+ else:
465
+ if self.abbr_title != None:
466
+ self.abbr_list[self.abbr_data] = self.abbr_title
467
+ self.abbr_title = None
468
+ self.abbr_data = ''
469
+
470
+ if tag == "a" and not self.ignore_links:
471
+ if start:
472
+ if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
473
+ self.astack.append(attrs)
474
+ self.maybe_automatic_link = attrs['href']
475
+ else:
476
+ self.astack.append(None)
477
+ else:
478
+ if self.astack:
479
+ a = self.astack.pop()
480
+ if self.maybe_automatic_link:
481
+ self.maybe_automatic_link = None
482
+ elif a:
483
+ if self.inline_links:
484
+ self.o("](" + escape_md(a['href']) + ")")
485
+ else:
486
+ i = self.previousIndex(a)
487
+ if i is not None:
488
+ a = self.a[i]
489
+ else:
490
+ self.acount += 1
491
+ a['count'] = self.acount
492
+ a['outcount'] = self.outcount
493
+ self.a.append(a)
494
+ self.o("][" + str(a['count']) + "]")
495
+
496
+ if tag == "img" and start and not self.ignore_images:
497
+ if has_key(attrs, 'src'):
498
+ attrs['href'] = attrs['src']
499
+ alt = attrs.get('alt', '')
500
+ self.o("![" + escape_md(alt) + "]")
501
+
502
+ if self.inline_links:
503
+ self.o("(" + escape_md(attrs['href']) + ")")
504
+ else:
505
+ i = self.previousIndex(attrs)
506
+ if i is not None:
507
+ attrs = self.a[i]
508
+ else:
509
+ self.acount += 1
510
+ attrs['count'] = self.acount
511
+ attrs['outcount'] = self.outcount
512
+ self.a.append(attrs)
513
+ self.o("[" + str(attrs['count']) + "]")
514
+
515
+ if tag == 'dl' and start: self.p()
516
+ if tag == 'dt' and not start: self.pbr()
517
+ if tag == 'dd' and start: self.o(' ')
518
+ if tag == 'dd' and not start: self.pbr()
519
+
520
+ if tag in ["ol", "ul"]:
521
+ # Google Docs create sub lists as top level lists
522
+ if (not self.list) and (not self.lastWasList):
523
+ self.p()
524
+ if start:
525
+ if self.google_doc:
526
+ list_style = google_list_style(tag_style)
527
+ else:
528
+ list_style = tag
529
+ numbering_start = list_numbering_start(attrs)
530
+ self.list.append({'name':list_style, 'num':numbering_start})
531
+ else:
532
+ if self.list: self.list.pop()
533
+ self.lastWasList = True
534
+ else:
535
+ self.lastWasList = False
536
+
537
+ if tag == 'li':
538
+ self.pbr()
539
+ if start:
540
+ if self.list: li = self.list[-1]
541
+ else: li = {'name':'ul', 'num':0}
542
+ if self.google_doc:
543
+ nest_count = self.google_nest_count(tag_style)
544
+ else:
545
+ nest_count = len(self.list)
546
+ self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
547
+ if li['name'] == "ul": self.o(self.ul_item_mark + " ")
548
+ elif li['name'] == "ol":
549
+ li['num'] += 1
550
+ self.o(str(li['num'])+". ")
551
+ self.start = 1
552
+
553
+ if tag in ["table", "tr"] and start: self.p()
554
+ if tag == 'td': self.pbr()
555
+
556
+ if tag == "pre":
557
+ if start:
558
+ self.startpre = 1
559
+ self.pre = 1
560
+ else:
561
+ self.pre = 0
562
+ self.p()
563
+
564
+ def pbr(self):
565
+ if self.p_p == 0:
566
+ self.p_p = 1
567
+
568
+ def p(self):
569
+ self.p_p = 2
570
+
571
+ def soft_br(self):
572
+ self.pbr()
573
+ self.br_toggle = ' '
574
+
575
+ def o(self, data, puredata=0, force=0):
576
+ if self.abbr_data is not None:
577
+ self.abbr_data += data
578
+
579
+ if not self.quiet:
580
+ if self.google_doc:
581
+ # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
582
+ lstripped_data = data.lstrip()
583
+ if self.drop_white_space and not (self.pre or self.code):
584
+ data = lstripped_data
585
+ if lstripped_data != '':
586
+ self.drop_white_space = 0
587
+
588
+ if puredata and not self.pre:
589
+ data = re.sub('\s+', ' ', data)
590
+ if data and data[0] == ' ':
591
+ self.space = 1
592
+ data = data[1:]
593
+ if not data and not force: return
594
+
595
+ if self.startpre:
596
+ #self.out(" :") #TODO: not output when already one there
597
+ if not data.startswith("\n"): # <pre>stuff...
598
+ data = "\n" + data
599
+
600
+ bq = (">" * self.blockquote)
601
+ if not (force and data and data[0] == ">") and self.blockquote: bq += " "
602
+
603
+ if self.pre:
604
+ if not self.list:
605
+ bq += " "
606
+ #else: list content is already partially indented
607
+ for i in xrange(len(self.list)):
608
+ bq += " "
609
+ data = data.replace("\n", "\n"+bq)
610
+
611
+ if self.startpre:
612
+ self.startpre = 0
613
+ if self.list:
614
+ data = data.lstrip("\n") # use existing initial indentation
615
+
616
+ if self.start:
617
+ self.space = 0
618
+ self.p_p = 0
619
+ self.start = 0
620
+
621
+ if force == 'end':
622
+ # It's the end.
623
+ self.p_p = 0
624
+ self.out("\n")
625
+ self.space = 0
626
+
627
+ if self.p_p:
628
+ self.out((self.br_toggle+'\n'+bq)*self.p_p)
629
+ self.space = 0
630
+ self.br_toggle = ''
631
+
632
+ if self.space:
633
+ if not self.lastWasNL: self.out(' ')
634
+ self.space = 0
635
+
636
+ if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
637
+ if force == "end": self.out("\n")
638
+
639
+ newa = []
640
+ for link in self.a:
641
+ if self.outcount > link['outcount']:
642
+ self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
643
+ if has_key(link, 'title'): self.out(" ("+link['title']+")")
644
+ self.out("\n")
645
+ else:
646
+ newa.append(link)
647
+
648
+ if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
649
+
650
+ self.a = newa
651
+
652
+ if self.abbr_list and force == "end":
653
+ for abbr, definition in self.abbr_list.items():
654
+ self.out(" *[" + abbr + "]: " + definition + "\n")
655
+
656
+ self.p_p = 0
657
+ self.out(data)
658
+ self.outcount += 1
659
+
660
+ def handle_data(self, data):
661
+ if r'\/script>' in data: self.quiet -= 1
662
+
663
+ if self.style:
664
+ self.style_def.update(dumb_css_parser(data))
665
+
666
+ if not self.maybe_automatic_link is None:
667
+ href = self.maybe_automatic_link
668
+ if href == data and self.absolute_url_matcher.match(href):
669
+ self.o("<" + data + ">")
670
+ return
671
+ else:
672
+ self.o("[")
673
+ self.maybe_automatic_link = None
674
+
675
+ if not self.code and not self.pre:
676
+ data = escape_md_section(data, snob=self.escape_snob)
677
+ self.o(data, 1)
678
+
679
+ def unknown_decl(self, data): pass
680
+
681
+ def charref(self, name):
682
+ if name[0] in ['x','X']:
683
+ c = int(name[1:], 16)
684
+ else:
685
+ c = int(name)
686
+
687
+ if not self.unicode_snob and c in unifiable_n.keys():
688
+ return unifiable_n[c]
689
+ else:
690
+ try:
691
+ return unichr(c)
692
+ except NameError: #Python3
693
+ return chr(c)
694
+
695
+ def entityref(self, c):
696
+ if not self.unicode_snob and c in unifiable.keys():
697
+ return unifiable[c]
698
+ else:
699
+ try: name2cp(c)
700
+ except KeyError: return "&" + c + ';'
701
+ else:
702
+ try:
703
+ return unichr(name2cp(c))
704
+ except NameError: #Python3
705
+ return chr(name2cp(c))
706
+
707
+ def replaceEntities(self, s):
708
+ s = s.group(1)
709
+ if s[0] == "#":
710
+ return self.charref(s[1:])
711
+ else: return self.entityref(s)
712
+
713
    # Matches one HTML entity or character reference, e.g. "&amp;",
    # "&#65;", "&#x41;"; group 1 is the body between '&' and ';'.
    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    def unescape(self, s):
        """Return `s` with every entity/charref replaced via replaceEntities()."""
        return self.r_unescape.sub(self.replaceEntities, s)
717
+ def google_nest_count(self, style):
718
+ """calculate the nesting count of google doc lists"""
719
+ nest_count = 0
720
+ if 'margin-left' in style:
721
+ nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
722
+ return nest_count
723
+
724
+
725
    def optwrap(self, text):
        """Wrap all paragraphs in the provided text."""
        # body_width == 0 disables wrapping entirely.
        if not self.body_width:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0  # how many consecutive '\n' `result` currently ends with
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para):
                    result += "\n".join(wrap(para, self.body_width))
                    if para.endswith('  '):
                        # trailing double space = markdown soft line break;
                        # preserve it on the wrapped output
                        result += "  \n"
                        newlines = 1
                    else:
                        result += "\n\n"
                        newlines = 2
                else:
                    # line-sensitive content (code blocks, lists): untouched
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                # blank input line: emit at most two consecutive newlines
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result
753
# Regexes shared by skipwrap() and the escape_md*() helpers below.

# "1. " style ordered-list item marker.
ordered_list_matcher = re.compile(r'\d+\.\s')
# "-", "*" or "+" bullet marker followed by whitespace.
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped inside other markdown constructs.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Full set of markdown metacharacters (used when escape_snob is set).
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# A number + dot at line start (would otherwise read as an ordered list).
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# A leading '+' at line start (would otherwise read as a bullet).
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
# A leading '-' at line start (bullet, header underline, or hr).
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
# Characters that must be backslash-escaped when already preceded by '\'.
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)
783
def skipwrap(para):
    """Return True when `para` must not be re-wrapped (code block, list
    item, or other line-sensitive markdown)."""
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap.  startswith() is also safe on an empty paragraph, where
    # the original para[0] == '\t' raised IndexError.
    if para.startswith('    ') or para.startswith('\t'):
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceeded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
803
def wrapwrite(text):
    """Write `text` to stdout as UTF-8 bytes.

    Python 3 requires the underlying binary stream (sys.stdout.buffer);
    Python 2's stdout accepts bytes directly.
    """
    encoded = text.encode('utf-8')
    try:
        out = sys.stdout.buffer  # Python 3
    except AttributeError:
        out = sys.stdout         # Python 2
    out.write(encoded)
810
def html2text(html, baseurl=''):
    """Convenience wrapper: convert an HTML string to markdown text using a
    fresh HTML2Text instance rooted at `baseurl`."""
    return HTML2Text(baseurl=baseurl).handle(html)
814
def unescape(s, unicode_snob=False):
    """Module-level helper: decode HTML entities in `s` with a throwaway
    HTML2Text instance configured for the given unicode preference."""
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
819
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs."""
    # equivalent to the replacement string r"\\\1": prefix each matched
    # metacharacter with a single backslash
    return md_chars_matcher.sub(lambda m: "\\" + m.group(1), text)
823
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    With snob=True every markdown metacharacter is escaped; otherwise only
    constructs that would change document structure (backslashes, leading
    list/number markers) are.
    """
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # escape leading "N.", "+" and "-" so they don't become lists/rules
    for matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        text = matcher.sub(r"\1\\\2", text)
    return text
834
def main():
    """Command-line entry point: read HTML from a file, a URL, or stdin and
    write the markdown conversion to stdout."""
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters.  Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            # NOTE(review): urllib.urlopen is Python 2 only; Python 3 would
            # need urllib.request.urlopen — confirm supported interpreters.
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): encoding is initialised to "utf-8" above, so this
            # None-branch (feedparser-based detection) looks unreachable.
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            # NOTE(review): unreachable for the same reason as above (chardet
            # detection never runs because encoding is never None here).
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
912
+
913
+ if __name__ == "__main__":
914
+ main()