headless_html_editor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/headless_html_editor.rb +194 -0
- metadata +79 -0
@@ -0,0 +1,194 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
# rubocop:disable LineLength, MethodLength
|
4
|
+
|
5
|
+
# Nokogiri does all HTML parsing and serialisation in this file, so a missing
# gem is fatal: report how to install it and stop.
begin
  require 'nokogiri'
rescue LoadError => le
  $stderr.puts "LoadError: #{le.message}"
  $stderr.puts 'Run: gem install nokogiri'
  # A bare `exit` reports success (status 0); use a failure status so shells
  # and build scripts can detect that the dependency is missing.
  exit 1
end
|
12
|
+
|
13
|
+
# Headless HTML Editor. Edit HTML files programmatically.
#
# An editor loads one HTML file into a Nokogiri document (see #dom), offers
# clean-up operations aimed at HTML exported from MS Word, and can write the
# document back to disk. The class methods .edit_folder and .bulk_edit apply a
# block to many files and save each of them afterwards.
class HeadlessHtmlEditor
  # CSS class names written by MS Word. #remove_word_artifacts drops the whole
  # class attribute from every element carrying one of them.
  # Deliberately not frozen: callers may already append their own class names.
  UNWANTED_CLASSES = %w[MsoNormal MsoBodyText NormalBold MsoTitle MsoHeader Templatehelp
                        TOCEntry Indent1 MsoCaption MsoListParagraph
                        MsoNormalTable MsoTableGrid MsoTableClassic1
                        MsoListParagraphCxSpFirst MsoListParagraphCxSpMiddle MsoListParagraphCxSpLast
                        MsoCommentText msocomtxt msocomoff MsoEndnoteText MsoFootnoteText]

  # The parsed document, or nil when the input file was not loaded.
  attr_reader :dom

  # Create a new Headless HTML Editor.
  #
  # input_file_name - path of the file to edit. It is only parsed when it is an
  #                   existing regular file whose name ends in ".html";
  #                   otherwise #dom stays nil.
  # input_encoding  - encoding used to open the file (default 'utf-8').
  def initialize(input_file_name, input_encoding = 'utf-8')
    @input_file_name = input_file_name
    @dom = nil
    return unless File.file?(input_file_name) && File.fnmatch?('**.html', input_file_name)

    puts "R: #{input_file_name}"
    # File.open with a block closes the handle once parsing is done, and,
    # unlike Kernel#open, never treats a name starting with "|" as a command.
    @dom = File.open(input_file_name, "r:#{input_encoding}", universal_newline: false) do |file|
      Nokogiri::HTML(file)
    end
  end

  # Cleanup after MS Word.
  #
  # options - Hash; when options[:rebuild_toc] is truthy the Word table of
  #           contents is rewritten as a nested ordered list. Note that the
  #           default only applies when no hash is passed at all.
  #
  # Returns the set of <br> elements removed in the last step.
  def remove_word_artifacts(options = { rebuild_toc: true })
    @dom.css('meta[name="Generator"]').remove
    remove_unlinked_anchors
    clean_headings
    remove_unwanted_classes
    unwrap_word_sections
    rebuild_table_of_contents if options[:rebuild_toc]
    remove_empty_paragraphs
    # TODO: conditional comments (<!--[if ...]> ... <![endif]-->) are not removed yet.
    @dom.css('table + br').remove
  end

  # Remove every script element in the document (not only those in the head).
  def remove_header_scripts
    @dom.css('script').remove
  end

  # Change tracking in MS Word adds a lot of ins and del tags. Deleted content
  # is dropped; inserted content is kept but its ins wrapper is removed.
  def accept_word_changes_tracked
    @dom.css('del').remove
    @dom.css('ins').each do |insertion|
      insertion.replace(insertion.inner_html)
    end
  end

  # Change h1 to h2 and so on. h6 is not changed, so former h5 and h6 headings
  # end up on the same level.
  def demote_headings
    @dom.css('h1, h2, h3, h4, h5').each do |heading|
      # The element name is "h" followed by one digit.
      heading.name = "h#{heading.name[1].to_i + 1}"
    end
  end

  # Save the file with the same file name.
  #
  # output_encoding - encoding of the written file (default 'utf-8').
  def save!(output_encoding = 'utf-8')
    save_as!(@input_file_name, output_encoding)
  end

  # Save file with a new file name.
  #
  # Failures are reported on stderr instead of being raised, so that bulk runs
  # continue with the next file.
  #
  # output_file_name - path to write to.
  # output_encoding  - encoding of the written file (default 'utf-8').
  #
  # Returns the number of bytes written, or nil when nothing was written.
  def save_as!(output_file_name, output_encoding = 'utf-8')
    puts "W: #{output_file_name}"
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2; the
    # resulting NoMethodError used to be swallowed by the rescue below, so
    # nothing was ever written.
    if File.writable?(output_file_name) || !File.exist?(output_file_name)
      File.open(output_file_name, "w:#{output_encoding}", universal_newline: false) do |file|
        file.write @dom.to_html(encoding: output_encoding, indent: 2)
      end
    else
      $stderr.puts 'Failed: Read only!'
    end
  rescue StandardError => se
    $stderr.puts "\nFailed!\n#{se.message}"
  end

  # Edit all HTML files in a folder.
  #
  # Each regular file directly inside +folder+ that loads as HTML is yielded
  # as an editor and saved in place afterwards. Backslashes in +folder+ are
  # treated as path separators.
  def self.edit_folder(folder, &block)
    Dir.open(folder.tr('\\', '/')) do |dir|
      dir.each do |entry|
        file_name = File.join(dir.path, entry)
        edit_file(file_name, &block) if File.file?(file_name)
      end
    end
  end

  # Edit files listed in a text file. File names are absolute.
  # If the first character on a line is # the line is ignored.
  #
  # Each listed file that exists and loads as HTML is yielded as an editor and
  # saved in place afterwards; existing files that do not load are reported.
  def self.bulk_edit(file_list_file_name, &block)
    txt_file_name = File.expand_path(file_list_file_name)
    File.readlines(txt_file_name).each do |line|
      next if line.start_with?('#')

      # Strip removes the trailing newline characters.
      file_name = line.strip
      next unless File.file?(file_name)

      puts "No DOM found in #{file_name}." unless edit_file(file_name, &block)
    end
  end

  # Load one file, yield its editor and save it in place.
  # Returns false, without yielding, when the file did not load as HTML.
  def self.edit_file(file_name)
    editor = new(file_name)
    return false if editor.dom.nil?

    yield editor
    editor.save!
    true
  end
  private_class_method :edit_file

  private

  # Remove abandoned anchors, that are not linked to.
  def remove_unlinked_anchors
    # Collect every link target once. Comparing strings instead of building a
    # CSS selector per anchor also copes with names that contain quotes.
    targets = @dom.css('a[href]').each_with_object({}) do |link, seen|
      seen[link['href']] = true
    end
    @dom.css('a[name]').each do |anchor|
      name = anchor['name']
      next if targets.key?("##{name}")

      if anchor['href']
        # The anchor is also a hyperlink: drop only the unused name.
        anchor.remove_attribute('name')
      else
        puts "<a name=\"#{name}\"> was removed, because it had no links to it."
        anchor.replace(anchor.inner_html)
      end
    end
  end

  # Clean up h1-h6 tags: move the name of a contained anchor to the id of the
  # heading and strip the leading section number from the heading.
  def clean_headings
    @dom.css('h1, h2, h3, h4, h5, h6').each do |heading|
      anchor = heading.at_css('a[name]')
      if anchor
        heading['id'] = anchor['name'].sub(/_Toc/, 'Toc')
        anchor.replace(anchor.inner_html)
      end
      # The number may be followed by non-breaking spaces (U+00A0). Both the
      # raw character and its entity forms are matched, since the serialised
      # markup may contain either.
      heading.inner_html = heading.inner_html.sub(/\A(\s*\d+\.?)+(?:\u00A0|&nbsp;|&#160;)*/, '').strip
    end
  end

  # Remove Word's "normal" classes.
  def remove_unwanted_classes
    UNWANTED_CLASSES.each do |class_name|
      @dom.css(".#{class_name}").each do |node|
        node.remove_attribute('class')
      end
    end
  end

  # Remove unwanted section tags, keeping their content.
  def unwrap_word_sections
    @dom.css('.WordSection1, .WordSection2, .WordSection3, .WordSection4, .WordSection5, .WordSection6, .WordSection7, .WordSection8').each do |section|
      puts "Removing #{section.name}.#{section['class']}"
      section.replace(section.inner_html)
    end
  end

  # Replace Word's table of contents with a nested ordered list.
  def rebuild_table_of_contents
    # Remove section numbers and page numbers from the TOC links. The result
    # is assigned as text so that characters like & and < are escaped instead
    # of being parsed as markup.
    @dom.css('.MsoToc1 a, .MsoToc2 a, .MsoToc3 a, .MsoToc4 a').each do |link|
      link.content = link.inner_text.sub(/\A(\d+\.)+/, '').sub(/(\s+\d+)\Z/, '').strip
    end

    first_item = @dom.at_css('.MsoToc1')
    if first_item
      markup = toc_markup(first_item)
      first_item.replace(markup) unless markup.empty?
    end
    # Remove old Table of Contents
    @dom.css('.MsoToc1, .MsoToc2, .MsoToc3, .MsoToc4').each { |item| item.remove }
  end

  # Build the markup of the ordered list for the run of TOC entries that
  # starts at +first_item+. Returns an empty string when no entry has a link.
  def toc_markup(first_item)
    markup = []
    open_levels = [] # TOC level of every list that is currently open
    toc_items_from(first_item).each do |item|
      item.inner_html = item.inner_html.sub(/\n/, ' ')
      # Look the link up after the assignment above, which re-creates the children.
      link = item.at_css('a')
      if link.nil?
        # Entries without a link are reported and left out of the list.
        puts item.to_s
        next
      end

      level = item['class'].to_s[/MsoToc(\d+)/, 1].to_i
      if open_levels.empty? || level > open_levels.last
        markup << "\n<ol#{' id="toc"' if open_levels.empty?}>\n"
        open_levels.push(level)
      else
        markup << "</li>\n"
        # Close one nested list per level the entry moves back up.
        while open_levels.size > 1 && open_levels[-2] >= level
          markup << "</ol>\n</li>\n"
          open_levels.pop
        end
        open_levels[-1] = level
      end
      link.inner_html = link.inner_html.sub(/\A(\s*\d+)/, '').strip
      markup << "<li>#{item.inner_html.sub(/#_Toc/, '#Toc')}"
    end
    unless open_levels.empty?
      # Close the last entry and every list that is still open.
      markup << "</li>\n"
      markup << "</ol>\n</li>\n" * (open_levels.size - 1)
      markup << "</ol>\n"
    end
    markup.join
  end

  # Collect +first_item+ and the element siblings that directly follow it, up
  # to the first one whose class attribute does not start with "MsoToc".
  def toc_items_from(first_item)
    items = []
    item = first_item
    while item
      items << item
      # next_element already skips text nodes.
      item = item.next_element
      item = nil unless item && item['class'] && item['class'].start_with?('MsoToc')
    end
    items
  end

  # Remove empty paragraphs. A paragraph is empty when it contains no image
  # and nothing but whitespace and non-breaking spaces (U+00A0), which String#strip keeps.
  def remove_empty_paragraphs
    @dom.css('p').each do |paragraph|
      next unless paragraph.content.gsub("\u00A0", '').strip.empty? && !paragraph.at_css('img')

      puts 'Removing empty paragraph.'
      paragraph.remove
    end
  end
end
|
189
|
+
|
190
|
+
# Command line entry point: when this file is run as a script, every HTML file
# in the folder given as first argument gets a marker comment appended to its
# html element and is saved in place.
if __FILE__ == $PROGRAM_NAME
  folder = ARGV[0]
  # Without an argument File.expand_path(nil) would fail with a TypeError;
  # print a usage line and exit with a failure status instead.
  abort "Usage: #{File.basename($PROGRAM_NAME)} FOLDER" if folder.nil?

  HeadlessHtmlEditor.edit_folder(File.expand_path(folder)) do |editor|
    root = editor.dom.at_css('html')
    # Skip documents without an html element rather than calling add_child on nil.
    root.add_child '<!-- HeadlessHtmlEditor was here! -->' if root
  end
end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: headless_html_editor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Bo Frederiksen
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-10-03 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.6.0
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.6.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: Headless HTML Editor - edit HTML files, without a UI.
|
47
|
+
email:
|
48
|
+
- bofrede@bofrede.dk
|
49
|
+
executables: []
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- lib/headless_html_editor.rb
|
54
|
+
homepage: https://github.com/bofrede/headless_html_editor
|
55
|
+
licenses:
|
56
|
+
- MIT
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
69
|
+
requirements:
|
70
|
+
- - ! '>='
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements: []
|
74
|
+
rubyforge_project:
|
75
|
+
rubygems_version: 1.8.24
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: Headless HTML Editor - edit HTML files, without a UI.
|
79
|
+
test_files: []
|