ghostwriter 0.1.0.placeholder → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.rubocop.yml +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +1 -1
- data/CODE_OF_CONDUCT.md +6 -42
- data/Gemfile +3 -1
- data/LICENSE.txt +1 -1
- data/README.md +258 -11
- data/RELEASE_NOTES.md +77 -0
- data/Rakefile +5 -3
- data/bin/console +5 -3
- data/bin/setup +1 -2
- data/dirt-textify.gemspec +37 -0
- data/lib/ghostwriter.rb +4 -4
- data/lib/ghostwriter/version.rb +3 -1
- data/lib/ghostwriter/writer.rb +168 -0
- metadata +63 -17
- data/ghostwriter.gemspec +0 -25
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 80d5aced9b18684b3640c28ce1e86c8e9859942f57fefbd26dd1a1f3e7791eaf
|
4
|
+
data.tar.gz: 0f71619c0a7e247cf163f074ca7c8bb54fa8c93daaa5649df8f853fd0ccab1da
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f9760753d4ffc30bee200a33347cd9aeda0b4593304f07ff9ce53c4ca1f971d51b50644feb763ffc51ec5635202e971704592f45d0cbe021693a7e790e39c7e9
|
7
|
+
data.tar.gz: 203df0d639d25f35a73dcdde0f12c037e8f508e086c34cec5e2da7325a8b0e173db10590da50550a50e853f74f0753ce231b36646619fc3c1f166bdd986e7186
|
data/.rubocop.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
inherit_from: ../.rubocop.yml
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-2.7.1
|
data/.travis.yml
CHANGED
data/CODE_OF_CONDUCT.md
CHANGED
@@ -1,49 +1,13 @@
|
|
1
1
|
# Contributor Code of Conduct
|
2
2
|
|
3
|
-
As contributors and maintainers of this project,
|
4
|
-
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
-
contribute through reporting issues, posting feature requests, updating
|
6
|
-
documentation, submitting pull requests or patches, and other activities.
|
3
|
+
As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
|
7
4
|
|
8
|
-
We are committed to making participation in this project a harassment-free
|
9
|
-
experience for everyone, regardless of level of experience, gender, gender
|
10
|
-
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
-
body size, race, ethnicity, age, religion, or nationality.
|
5
|
+
We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
|
12
6
|
|
13
|
-
Examples of unacceptable behavior by participants include
|
7
|
+
Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct.
|
14
8
|
|
15
|
-
|
16
|
-
* Personal attacks
|
17
|
-
* Trolling or insulting/derogatory comments
|
18
|
-
* Public or private harassment
|
19
|
-
* Publishing other's private information, such as physical or electronic
|
20
|
-
addresses, without explicit permission
|
21
|
-
* Other unethical or unprofessional conduct
|
9
|
+
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team.
|
22
10
|
|
23
|
-
|
24
|
-
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
-
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
-
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
-
threatening, offensive, or harmful.
|
11
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
|
28
12
|
|
29
|
-
|
30
|
-
fairly and consistently applying these principles to every aspect of managing
|
31
|
-
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
-
Conduct may be permanently removed from the project team.
|
33
|
-
|
34
|
-
This code of conduct applies both within project spaces and in public spaces
|
35
|
-
when an individual is representing the project or its community.
|
36
|
-
|
37
|
-
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
-
reported by contacting a project maintainer at robin@tenjin.ca. All
|
39
|
-
complaints will be reviewed and investigated and will result in a response that
|
40
|
-
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
-
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
-
incident.
|
43
|
-
|
44
|
-
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
-
version 1.3.0, available at
|
46
|
-
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
-
|
48
|
-
[homepage]: http://contributor-covenant.org
|
49
|
-
[version]: http://contributor-covenant.org/version/1/3/0/
|
13
|
+
This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.0.0, available at [http://contributor-covenant.org/version/1/0/0/](http://contributor-covenant.org/version/1/0/0/)
|
data/Gemfile
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,15 @@
|
|
1
1
|
# Ghostwriter
|
2
2
|
|
3
|
-
|
3
|
+
Ghostwriter rewrites HTML as plain text while preserving as much legibility and functionality as possible.
|
4
4
|
|
5
|
-
|
5
|
+
It's sort of like a reverse-markdown or a very, very simple screen reader.
|
6
|
+
|
7
|
+
## But Why, Though?
|
8
|
+
|
9
|
+
* Some email clients won't or can’t handle HTML at all
|
10
|
+
* Some people explicitly choose plaintext just by preference or accessibility
|
11
|
+
* Spam filters tend to like emails with a plain text alternative (but if you use this gem to help you spam people, I
|
12
|
+
will yell at you)
|
6
13
|
|
7
14
|
## Installation
|
8
15
|
|
@@ -14,28 +21,268 @@ gem 'ghostwriter'
|
|
14
21
|
|
15
22
|
And then execute:
|
16
23
|
|
17
|
-
|
24
|
+
bundle
|
18
25
|
|
19
|
-
Or install it
|
26
|
+
Or install it manually with:
|
20
27
|
|
21
|
-
|
28
|
+
gem install ghostwriter
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
25
|
-
|
32
|
+
Create a `Ghostwriter::Writer` and call `#textify` with the html you want modified:
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
html = '<html><body><p>This is some markup <a href="tenjin.ca">and a link</a></p><p>Other tags translate, too</p></body></html>'
|
36
|
+
|
37
|
+
Ghostwriter::Writer.new.textify(html)
|
38
|
+
```
|
39
|
+
|
40
|
+
Produces:
|
41
|
+
|
42
|
+
```
|
43
|
+
This is some markup and a link (tenjin.ca)
|
44
|
+
|
45
|
+
Other tags translate, too
|
46
|
+
```
|
47
|
+
|
48
|
+
### Links
|
49
|
+
|
50
|
+
Links are converted to the link text followed by the link target in brackets:
|
51
|
+
|
52
|
+
```html
|
53
|
+
|
54
|
+
<html>
|
55
|
+
<body>
|
56
|
+
Visit our <a href="https://example.com">Website</a>
|
57
|
+
</body>
|
58
|
+
</html>
|
59
|
+
```
|
60
|
+
|
61
|
+
Becomes:
|
62
|
+
|
63
|
+
```
|
64
|
+
Visit our Website (https://example.com)
|
65
|
+
```
|
66
|
+
|
67
|
+
#### Relative Links
|
68
|
+
|
69
|
+
Since emails are wholly distinct from your web address, relative links might break.
|
70
|
+
|
71
|
+
To avoid this problem, either use the `<base>` header tag:
|
72
|
+
|
73
|
+
```html
|
74
|
+
|
75
|
+
<html>
|
76
|
+
<head>
|
77
|
+
<base href="https://www.example.com">
|
78
|
+
</head>
|
79
|
+
<body>
|
80
|
+
Use the base tag to <a href="/contact">expand</a> links.
|
81
|
+
</body>
|
82
|
+
</html>
|
83
|
+
```
|
84
|
+
|
85
|
+
Becomes:
|
86
|
+
|
87
|
+
```
|
88
|
+
Use the base tag to expand (https://www.example.com/contact) links.
|
89
|
+
```
|
90
|
+
|
91
|
+
Or you can use the `link_base` configuration:
|
92
|
+
|
93
|
+
```ruby
|
94
|
+
Ghostwriter::Writer.new(link_base: 'tenjin.ca').textify(html)
|
95
|
+
```
|
96
|
+
|
97
|
+
### Images
|
98
|
+
|
99
|
+
Images with alt text are converted:
|
100
|
+
|
101
|
+
```html
|
102
|
+
<img src="logo.jpg" alt="ACME Anvils" />
|
103
|
+
```
|
104
|
+
|
105
|
+
Becomes:
|
106
|
+
|
107
|
+
```
|
108
|
+
ACME Anvils (logo.jpg)
|
109
|
+
```
|
110
|
+
|
111
|
+
But images lacking alt text or with a presentation ARIA role are ignored:
|
112
|
+
|
113
|
+
```html
|
114
|
+
<!-- these will just become an empty string -->
|
115
|
+
<img src="decoration.jpg">
|
116
|
+
<img src="logo.jpg" role="presentation">
|
117
|
+
```
|
118
|
+
|
119
|
+
And images with data URIs won't include the data portion.
|
120
|
+
|
121
|
+
```html
|
122
|
+
<img src="data:image/png;base64,..." alt="Data picture"/>
|
123
|
+
```
|
124
|
+
|
125
|
+
Becomes:
|
126
|
+
|
127
|
+
```
|
128
|
+
Data picture (embedded)
|
129
|
+
```
|
130
|
+
|
131
|
+
|
132
|
+
### Lists
|
133
|
+
|
134
|
+
Lists are converted, too. They are padded with newlines and are given simple markers:
|
135
|
+
|
136
|
+
```html
|
137
|
+
|
138
|
+
<ul>
|
139
|
+
<li>Planes</li>
|
140
|
+
<li>Trains</li>
|
141
|
+
<li>Automobiles</li>
|
142
|
+
</ul>
|
143
|
+
<ol>
|
144
|
+
<li>I get knocked down</li>
|
145
|
+
<li>I get up again</li>
|
146
|
+
<li>Never gonna keep me down</li>
|
147
|
+
</ol>
|
148
|
+
```
|
149
|
+
|
150
|
+
Becomes:
|
151
|
+
|
152
|
+
```
|
153
|
+
|
154
|
+
- Planes
|
155
|
+
- Trains
|
156
|
+
- Automobiles
|
157
|
+
|
158
|
+
1. I get knocked down
|
159
|
+
2. I get up again
|
160
|
+
3. Never gonna keep me down
|
161
|
+
|
162
|
+
```
|
163
|
+
|
164
|
+
### Tables
|
165
|
+
|
166
|
+
Tables are still often used in email structuring because support for more modern HTML and CSS is inconsistent. If your
|
167
|
+
table is purely presentational, mark it with `role="presentation"`. See below for details.
|
168
|
+
|
169
|
+
For real data tables, Ghostwriter tries to maintain table structure for simple tables:
|
170
|
+
|
171
|
+
```html
|
172
|
+
|
173
|
+
<table>
|
174
|
+
<thead>
|
175
|
+
<tr>
|
176
|
+
<th>Ship</th>
|
177
|
+
<th>Captain</th>
|
178
|
+
</tr>
|
179
|
+
</thead>
|
180
|
+
<tbody>
|
181
|
+
<tr>
|
182
|
+
<td>Enterprise</td>
|
183
|
+
<td>Jean-Luc Picard</td>
|
184
|
+
</tr>
|
185
|
+
<tr>
|
186
|
+
<td>TARDIS</td>
|
187
|
+
<td>The Doctor</td>
|
188
|
+
</tr>
|
189
|
+
<tr>
|
190
|
+
<td>Planet Express Ship</td>
|
191
|
+
<td>Turanga Leela</td>
|
192
|
+
</tr>
|
193
|
+
</tbody>
|
194
|
+
</table>
|
195
|
+
```
|
196
|
+
|
197
|
+
Becomes:
|
198
|
+
|
199
|
+
```
|
200
|
+
| Ship | Captain |
|
201
|
+
|---------------------|-----------------|
|
202
|
+
| Enterprise | Jean-Luc Picard |
|
203
|
+
| TARDIS | The Doctor |
|
204
|
+
| Planet Express Ship | Turanga Leela |
|
205
|
+
```
|
26
206
|
|
27
|
-
|
207
|
+
### Presentation ARIA Role
|
28
208
|
|
29
|
-
|
209
|
+
Lists and tables with `role="presentation"` will be treated as a simple container and the normal behaviour will be
|
210
|
+
suppressed.
|
30
211
|
|
31
|
-
|
212
|
+
```html
|
213
|
+
|
214
|
+
<table role="presentation">
|
215
|
+
<tr>
|
216
|
+
<td>The table is a lie</td>
|
217
|
+
</tr>
|
218
|
+
</table>
|
219
|
+
<ul role="presentation">
|
220
|
+
<li>No such list</li>
|
221
|
+
</ul>
|
222
|
+
```
|
223
|
+
|
224
|
+
Becomes:
|
225
|
+
|
226
|
+
```
|
227
|
+
The table is a lie
|
228
|
+
No such list
|
229
|
+
```
|
230
|
+
|
231
|
+
### Mail Gem Example
|
232
|
+
|
233
|
+
To use `#textify` with the [mail](https://github.com/mikel/mail) gem, just provide the text-part by passing the html
|
234
|
+
through Ghostwriter:
|
235
|
+
|
236
|
+
```ruby
|
237
|
+
require 'mail'
|
238
|
+
|
239
|
+
html = 'My email and a <a href="https://tenjin.ca">link</a>'
|
240
|
+
ghostwriter = Ghostwriter::Writer.new
|
241
|
+
|
242
|
+
Mail.deliver do
|
243
|
+
to 'bob@example.com'
|
244
|
+
from 'dot@example.com'
|
245
|
+
subject 'Using Ghostwriter with Mail'
|
246
|
+
|
247
|
+
html_part do
|
248
|
+
content_type 'text/html; charset=UTF-8'
|
249
|
+
body html
|
250
|
+
end
|
251
|
+
|
252
|
+
text_part do
|
253
|
+
body ghostwriter.textify(html)
|
254
|
+
end
|
255
|
+
end
|
256
|
+
|
257
|
+
```
|
32
258
|
|
33
259
|
## Contributing
|
34
260
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
261
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/TenjinInc/ghostwriter
|
262
|
+
|
263
|
+
This project is intended to be a friendly space for collaboration, and contributors are expected to adhere to the
|
264
|
+
[Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
265
|
+
|
266
|
+
### Core Developers
|
267
|
+
|
268
|
+
After checking out the repo, run `bundle install` to install dependencies. Then, run `rake spec` to run the tests. You
|
269
|
+
can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
36
270
|
|
271
|
+
#### Local Install
|
272
|
+
|
273
|
+
To install this gem onto your local machine only, run
|
274
|
+
|
275
|
+
`bundle exec rake install`
|
276
|
+
|
277
|
+
#### Gem Release
|
278
|
+
|
279
|
+
To release a gem to the world at large
|
280
|
+
|
281
|
+
1. Update the version number in `version.rb`,
|
282
|
+
2. Run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push
|
283
|
+
the `.gem` file to [rubygems.org](https://rubygems.org).
|
284
|
+
3. Do a wee dance
|
37
285
|
|
38
286
|
## License
|
39
287
|
|
40
288
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
41
|
-
|
data/RELEASE_NOTES.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# Release Notes
|
2
|
+
|
3
|
+
## 1.0.0 (2021-03-21)
|
4
|
+
|
5
|
+
### Major
|
6
|
+
|
7
|
+
* Moved `link_base` parameter to constructor
|
8
|
+
* Moved input HTML parameter to `#textify`
|
9
|
+
|
10
|
+
### Minor
|
11
|
+
|
12
|
+
* Treats tables and lists with role="presentation" as simple containers
|
13
|
+
* Now handles ordered and unordered lists
|
14
|
+
* Images are now replaced with their alt text
|
15
|
+
|
16
|
+
### Bugfixes
|
17
|
+
|
18
|
+
* none
|
19
|
+
|
20
|
+
## 0.4.2 (2021-03-17)
|
21
|
+
|
22
|
+
### Major
|
23
|
+
|
24
|
+
* none
|
25
|
+
|
26
|
+
### Minor
|
27
|
+
|
28
|
+
* none
|
29
|
+
|
30
|
+
### Bugfixes
|
31
|
+
|
32
|
+
* Works with links using `tel:` and `mailto:` schemes.
|
33
|
+
|
34
|
+
## 0.4.1 (2021-03-17)
|
35
|
+
|
36
|
+
### Major
|
37
|
+
|
38
|
+
* none
|
39
|
+
|
40
|
+
### Minor
|
41
|
+
|
42
|
+
* No longer provides link target in brackets after link text when they are the same
|
43
|
+
|
44
|
+
### Bugfixes
|
45
|
+
|
46
|
+
* Added explicit testing for HTML entity interpretation
|
47
|
+
|
48
|
+
## 0.4.0 (2021-03-16)
|
49
|
+
|
50
|
+
### Major
|
51
|
+
|
52
|
+
* Updated gem dependencies
|
53
|
+
|
54
|
+
### Minor
|
55
|
+
|
56
|
+
* Updated docs
|
57
|
+
* Added support for tables
|
58
|
+
|
59
|
+
### Bugfixes
|
60
|
+
|
61
|
+
* none
|
62
|
+
|
63
|
+
## 0.3.0 (2016-03-06)
|
64
|
+
|
65
|
+
### Major
|
66
|
+
|
67
|
+
* Renamed to Ghostwriter
|
68
|
+
|
69
|
+
### Minor
|
70
|
+
|
71
|
+
* Docs: Added instruction for using textify with mail gem
|
72
|
+
|
73
|
+
### Bugfixes
|
74
|
+
|
75
|
+
* none
|
76
|
+
|
77
|
+
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
|
-
require
|
4
|
-
require
|
5
|
+
require 'bundler/setup'
|
6
|
+
require 'ghostwriter'
|
5
7
|
|
6
8
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
9
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +12,5 @@ require "ghostwriter"
|
|
10
12
|
# require "pry"
|
11
13
|
# Pry.start
|
12
14
|
|
13
|
-
require
|
15
|
+
require 'irb'
|
14
16
|
IRB.start
|
data/bin/setup
CHANGED
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'ghostwriter/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = 'ghostwriter'
|
9
|
+
spec.version = Ghostwriter::VERSION
|
10
|
+
spec.authors = ['Robin Miller']
|
11
|
+
spec.email = ['robin@tenjin.ca']
|
12
|
+
|
13
|
+
spec.summary = 'Intelligently extracts plaintext from an HTML document.'
|
14
|
+
spec.description = <<~DESC
|
15
|
+
Transforms HTML into plaintext while preserving legibility and functionality.
|
16
|
+
DESC
|
17
|
+
spec.homepage = 'https://github.com/TenjinInc/ghostwriter'
|
18
|
+
spec.license = 'MIT'
|
19
|
+
|
20
|
+
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
21
|
+
f.match(%r{^(test|spec|features)/})
|
22
|
+
end
|
23
|
+
|
24
|
+
spec.bindir = 'exe'
|
25
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
26
|
+
spec.require_paths = ['lib']
|
27
|
+
|
28
|
+
spec.required_ruby_version = '~> 2.4'
|
29
|
+
|
30
|
+
spec.add_dependency 'nokogiri', '= 1.8.4'
|
31
|
+
|
32
|
+
spec.add_development_dependency 'bundler', '~> 2.2'
|
33
|
+
spec.add_development_dependency 'rake', '~> 13.0'
|
34
|
+
spec.add_development_dependency 'rspec', '~> 3.3'
|
35
|
+
spec.add_development_dependency 'rubocop', '~> 1.11'
|
36
|
+
spec.add_development_dependency 'rubocop-performance', '~> 1.10'
|
37
|
+
end
|
data/lib/ghostwriter.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
require 'ghostwriter/version'
|
4
|
+
require 'ghostwriter/writer'
|
5
|
+
require 'nokogiri'
|
data/lib/ghostwriter/version.rb
CHANGED
@@ -0,0 +1,168 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ghostwriter
|
4
|
+
# Main Ghostwriter converter object.
|
5
|
+
class Writer
|
6
|
+
# Creates a new ghostwriter
|
7
|
+
#
|
8
|
+
# @param [String] link_base the url to prefix relative links with
|
9
|
+
def initialize(link_base: '')
|
10
|
+
@link_base = link_base
|
11
|
+
@list_marker = '-'
|
12
|
+
end
|
13
|
+
|
14
|
+
# Strips HTML down to plain text.
|
15
|
+
#
|
16
|
+
# @param html [String] the HTML to be converted to text
|
17
|
+
#
|
18
|
+
# @return converted text
|
19
|
+
def textify(html)
|
20
|
+
doc = Nokogiri::HTML(normalize_whitespace(html).gsub('</p>', "</p>\n\n"))
|
21
|
+
|
22
|
+
doc.search('style, script').remove
|
23
|
+
|
24
|
+
replace_anchors(doc)
|
25
|
+
replace_images(doc)
|
26
|
+
|
27
|
+
simple_replace(doc, '*[role="presentation"]', "\n")
|
28
|
+
|
29
|
+
replace_headers(doc)
|
30
|
+
replace_lists(doc)
|
31
|
+
replace_tables(doc)
|
32
|
+
|
33
|
+
simple_replace(doc, 'hr', "\n----------\n")
|
34
|
+
simple_replace(doc, 'br', "\n")
|
35
|
+
|
36
|
+
# doc.search('p').each do |link_node|
|
37
|
+
# link_node.inner_html = link_node.inner_html + "\n\n"
|
38
|
+
# end
|
39
|
+
|
40
|
+
# trim, but only single-space character
|
41
|
+
doc.text.gsub(/^ +| +$/, '')
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def normalize_whitespace(html)
|
47
|
+
html.gsub(/\s/, ' ').squeeze(' ')
|
48
|
+
end
|
49
|
+
|
50
|
+
def replace_anchors(doc)
|
51
|
+
doc.search('a').each do |link_node|
|
52
|
+
href = get_link_target(link_node, get_link_base(doc))
|
53
|
+
|
54
|
+
link_node.inner_html = if link_matches(href, link_node.inner_html)
|
55
|
+
href.to_s
|
56
|
+
else
|
57
|
+
"#{ link_node.inner_html } (#{ href })"
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def link_matches(first, second)
|
63
|
+
first.to_s.gsub(%r{^https?://}, '').chomp('/') == second.gsub(%r{^https?://}, '').chomp('/')
|
64
|
+
end
|
65
|
+
|
66
|
+
def get_link_base(doc)
|
67
|
+
# <base> node is unique by W3C spec
|
68
|
+
base_node = doc.search('base').first
|
69
|
+
|
70
|
+
base_node ? base_node['href'] : @link_base
|
71
|
+
end
|
72
|
+
|
73
|
+
def get_link_target(link_node, base)
|
74
|
+
href = URI(link_node['href'])
|
75
|
+
if href.absolute?
|
76
|
+
href
|
77
|
+
else
|
78
|
+
base + href.to_s
|
79
|
+
end
|
80
|
+
rescue URI::InvalidURIError
|
81
|
+
link_node['href'].gsub(/^(tel|mailto):/, '').strip
|
82
|
+
end
|
83
|
+
|
84
|
+
def replace_headers(doc)
|
85
|
+
doc.search('header, h1, h2, h3, h4, h5, h6').each do |node|
|
86
|
+
node.inner_html = "-- #{ node.inner_html } --\n".squeeze(' ')
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def replace_images(doc)
|
91
|
+
doc.search('img[role=presentation]').remove
|
92
|
+
|
93
|
+
doc.search('img').each do |img_node|
|
94
|
+
src = img_node['src']
|
95
|
+
alt = img_node['alt']
|
96
|
+
|
97
|
+
src = 'embedded' if src.start_with? 'data:'
|
98
|
+
|
99
|
+
img_node.replace("#{ alt } (#{ src })") unless alt.nil? || alt.empty?
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def replace_lists(doc)
|
104
|
+
doc.search('ul, ol').each do |list_node|
|
105
|
+
list_node.search('./li').each_with_index do |list_item, i|
|
106
|
+
marker = if list_node.node_name == 'ol'
|
107
|
+
"#{ i + 1 }."
|
108
|
+
else
|
109
|
+
@list_marker
|
110
|
+
end
|
111
|
+
|
112
|
+
list_item.inner_html = "#{ marker } #{ list_item.inner_html }\n".squeeze(' ')
|
113
|
+
end
|
114
|
+
|
115
|
+
list_node.replace("\n#{ list_node.inner_html }\n")
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
def replace_tables(doc)
|
120
|
+
doc.css('table').each do |table|
|
121
|
+
column_sizes = calculate_column_sizes(table)
|
122
|
+
|
123
|
+
table.search('./thead/tr', './tbody/tr', './tr').each do |row|
|
124
|
+
replace_table_nodes(row, column_sizes)
|
125
|
+
|
126
|
+
row.inner_html = "#{ row.inner_html }|\n"
|
127
|
+
end
|
128
|
+
|
129
|
+
add_table_header_underline(table, column_sizes)
|
130
|
+
|
131
|
+
table.inner_html = "#{ table.inner_html }\n"
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def calculate_column_sizes(table)
|
136
|
+
column_sizes = table.search('tr').collect do |row|
|
137
|
+
row.search('th', 'td').collect do |node|
|
138
|
+
node.inner_html.length
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
column_sizes.transpose.collect(&:max)
|
143
|
+
end
|
144
|
+
|
145
|
+
def add_table_header_underline(table, column_sizes)
|
146
|
+
table.search('./thead').each do |row|
|
147
|
+
header_bottom = "|#{ column_sizes.collect { |len| ('-' * (len + 2)) }.join('|') }|"
|
148
|
+
|
149
|
+
row.inner_html = "#{ row.inner_html }#{ header_bottom }\n"
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
def replace_table_nodes(row, column_sizes)
|
154
|
+
row.search('th', 'td').each_with_index do |node, i|
|
155
|
+
new_content = "| #{ node.inner_html }".squeeze(' ')
|
156
|
+
|
157
|
+
# +2 for the extra spacing between text and pipe
|
158
|
+
node.inner_html = new_content.ljust(column_sizes[i] + 2)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
def simple_replace(doc, tag, replacement)
|
163
|
+
doc.search(tag).each do |node|
|
164
|
+
node.replace(node.inner_html + replacement)
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
metadata
CHANGED
@@ -1,59 +1,102 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ghostwriter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robin Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.8.4
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.8.4
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: bundler
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
16
30
|
requirements:
|
17
31
|
- - "~>"
|
18
32
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
33
|
+
version: '2.2'
|
20
34
|
type: :development
|
21
35
|
prerelease: false
|
22
36
|
version_requirements: !ruby/object:Gem::Requirement
|
23
37
|
requirements:
|
24
38
|
- - "~>"
|
25
39
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
40
|
+
version: '2.2'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rake
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - "~>"
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
47
|
+
version: '13.0'
|
34
48
|
type: :development
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
52
|
- - "~>"
|
39
53
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
54
|
+
version: '13.0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '3.
|
61
|
+
version: '3.3'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.11'
|
48
76
|
type: :development
|
49
77
|
prerelease: false
|
50
78
|
version_requirements: !ruby/object:Gem::Requirement
|
51
79
|
requirements:
|
52
80
|
- - "~>"
|
53
81
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
|
56
|
-
|
82
|
+
version: '1.11'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop-performance
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.10'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.10'
|
97
|
+
description: 'Transforms HTML into plaintext while preserving legibility and functionality.
|
98
|
+
|
99
|
+
'
|
57
100
|
email:
|
58
101
|
- robin@tenjin.ca
|
59
102
|
executables: []
|
@@ -62,17 +105,21 @@ extra_rdoc_files: []
|
|
62
105
|
files:
|
63
106
|
- ".gitignore"
|
64
107
|
- ".rspec"
|
108
|
+
- ".rubocop.yml"
|
109
|
+
- ".ruby-version"
|
65
110
|
- ".travis.yml"
|
66
111
|
- CODE_OF_CONDUCT.md
|
67
112
|
- Gemfile
|
68
113
|
- LICENSE.txt
|
69
114
|
- README.md
|
115
|
+
- RELEASE_NOTES.md
|
70
116
|
- Rakefile
|
71
117
|
- bin/console
|
72
118
|
- bin/setup
|
73
|
-
-
|
119
|
+
- dirt-textify.gemspec
|
74
120
|
- lib/ghostwriter.rb
|
75
121
|
- lib/ghostwriter/version.rb
|
122
|
+
- lib/ghostwriter/writer.rb
|
76
123
|
homepage: https://github.com/TenjinInc/ghostwriter
|
77
124
|
licenses:
|
78
125
|
- MIT
|
@@ -83,17 +130,16 @@ require_paths:
|
|
83
130
|
- lib
|
84
131
|
required_ruby_version: !ruby/object:Gem::Requirement
|
85
132
|
requirements:
|
86
|
-
- - "
|
133
|
+
- - "~>"
|
87
134
|
- !ruby/object:Gem::Version
|
88
|
-
version: '
|
135
|
+
version: '2.4'
|
89
136
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
137
|
requirements:
|
91
|
-
- - "
|
138
|
+
- - ">="
|
92
139
|
- !ruby/object:Gem::Version
|
93
|
-
version:
|
140
|
+
version: '0'
|
94
141
|
requirements: []
|
95
|
-
|
96
|
-
rubygems_version: 2.4.6
|
142
|
+
rubygems_version: 3.1.2
|
97
143
|
signing_key:
|
98
144
|
specification_version: 4
|
99
145
|
summary: Intelligently extracts plaintext from an HTML document.
|
data/ghostwriter.gemspec
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'ghostwriter/version'
|
5
|
-
|
6
|
-
Gem::Specification.new do |spec|
|
7
|
-
spec.name = "ghostwriter"
|
8
|
-
spec.version = Ghostwriter::VERSION
|
9
|
-
spec.authors = ["Robin Miller"]
|
10
|
-
spec.email = ["robin@tenjin.ca"]
|
11
|
-
|
12
|
-
spec.summary = %q{Intelligently extracts plaintext from an HTML document.}
|
13
|
-
spec.description = %q{Transforms HTML into plaintext while preserving legibility and functionality. Previously known as dirt-textify.}
|
14
|
-
spec.homepage = 'https://github.com/TenjinInc/ghostwriter'
|
15
|
-
spec.license = "MIT"
|
16
|
-
|
17
|
-
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir = "exe"
|
19
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = ["lib"]
|
21
|
-
|
22
|
-
spec.add_development_dependency "bundler", "~> 1.11"
|
23
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
24
|
-
spec.add_development_dependency "rspec", "~> 3.0"
|
25
|
-
end
|