headless_html_editor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. data/lib/headless_html_editor.rb +194 -0
  2. metadata +79 -0
@@ -0,0 +1,194 @@
#!/usr/bin/env ruby
# coding: utf-8
# rubocop:disable LineLength, MethodLength

# Nokogiri is required for all HTML parsing and editing done below.
# When it is missing, print an installation hint and stop.
begin
  require 'nokogiri'
rescue LoadError => le
  $stderr.puts "LoadError: #{le.message}"
  $stderr.puts 'Run: gem install nokogiri'
  # Exit with a non-zero status so shells and calling scripts can detect
  # that the dependency is missing (a bare `exit` reports success).
  exit 1
end
12
+
# Headless HTML Editor. Edit HTML files programmatically.
#
# An editor wraps one HTML file. The parsed Nokogiri document is exposed
# through #dom, can be modified directly or with the clean-up helpers
# below, and is written back with #save! or #save_as!.
class HeadlessHtmlEditor
  attr_reader :dom

  # CSS class names whose `class` attribute is stripped by
  # #remove_word_artifacts.
  UNWANTED_CLASSES = %w[MsoNormal MsoBodyText NormalBold MsoTitle MsoHeader Templatehelp
                        TOCEntry Indent1 MsoCaption MsoListParagraph
                        MsoNormalTable MsoTableGrid MsoTableClassic1
                        MsoListParagraphCxSpFirst MsoListParagraphCxSpMiddle MsoListParagraphCxSpLast
                        MsoCommentText msocomtxt msocomoff MsoEndnoteText MsoFootnoteText].freeze

  # Non-breaking space (U+00A0). Its UTF-8 byte sequence is C2 A0, which is
  # not the same thing as the code point U+C2A0.
  NBSP = "\u00A0"

  # Create a new Headless HTML Editor.
  #
  # The file is only parsed when it exists and its name matches '**.html';
  # otherwise #dom stays nil.
  #
  # input_file_name - path of the HTML file to edit.
  # input_encoding  - encoding used to read the file (default 'utf-8').
  def initialize(input_file_name, input_encoding = 'utf-8')
    @input_file_name = input_file_name
    return unless File.file?(input_file_name) && File.fnmatch?('**.html', input_file_name)

    # read html file
    puts "R: #{input_file_name}"
    # Block form closes the file handle as soon as parsing is done.
    @dom = File.open(input_file_name, "r:#{input_encoding}", universal_newline: false) do |io|
      Nokogiri::HTML(io)
    end
  end

  # Cleanup after MS Word.
  #
  # Removes the Generator meta tag, unlinked named anchors, heading
  # numbering, Word's default classes, WordSection wrappers, empty
  # paragraphs and <br> elements directly following a table.
  #
  # options - Hash; when options[:rebuild_toc] is truthy the Word table of
  #           contents is rewritten as a nested ordered list.
  def remove_word_artifacts(options = { rebuild_toc: true })
    @dom.css('meta[name="Generator"]').remove
    remove_unlinked_anchors
    clean_headings
    strip_word_classes
    unwrap_word_sections
    rebuild_toc if options[:rebuild_toc]
    remove_empty_paragraphs
    @dom.css('table + br').remove
    # /<!--\[if[.\n\r]+\[endif\]\s*-->/
  end

  # Remove every script element found in the document.
  def remove_header_scripts
    @dom.css('script').remove
  end

  # Change tracking in MS Word adds a lot of ins and del tags.
  # Deleted content is dropped; inserted content is kept without its wrapper.
  def accept_word_changes_tracked
    @dom.css('del').remove
    @dom.css('ins').each do |ins|
      ins.replace ins.inner_html
    end
  end

  # Change h1 to h2 and so on. h6 is not changed, so its a potential mess.
  def demote_headings
    @dom.css('h1, h2, h3, h4, h5').each do |heading|
      heading.name = "h#{heading.name[1].to_i + 1}"
    end
  end

  # Save the file with the same file name.
  def save!(output_encoding = 'utf-8')
    save_as!(@input_file_name, output_encoding)
  end

  # Save file with a new file name.
  #
  # Failures are reported on $stderr instead of being raised.
  #
  # output_file_name - path to write to.
  # output_encoding  - encoding of the written file (default 'utf-8').
  def save_as!(output_file_name, output_encoding = 'utf-8')
    puts "W: #{output_file_name}"
    begin
      # File.exist? replaces File.exists?, which newer Rubies no longer provide.
      if File.writable?(output_file_name) || !File.exist?(output_file_name)
        File.open(output_file_name, "w:#{output_encoding}", universal_newline: false) do |f|
          f.write @dom.to_html(encoding: output_encoding, indent: 2)
        end
      else
        $stderr.puts 'Failed: Read only!'
      end
    rescue StandardError => se
      $stderr.puts "\nFailed!\n#{se.message}"
    end
  end

  # Edit all HTML files in a folder.
  #
  # Yields an editor for every regular file in the folder that could be
  # parsed, then saves it in place.
  def self.edit_folder(folder, &block)
    Dir.open(folder.gsub(/\\/, '/')) do |d|
      d.each do |entry|
        file_name = File.join(d.path, entry)
        next unless File.file?(file_name)

        editor = new(file_name)
        next if editor.dom.nil?

        yield editor
        editor.save!
      end
    end
  end

  # Edit files listed in a text file. File names are absolute.
  # If the first character on a line is # the line is ignored.
  def self.bulk_edit(file_list_file_name, &block)
    txt_file_name = File.expand_path(file_list_file_name)
    File.readlines(txt_file_name).each do |line|
      next if line.start_with?('#')

      # Strip added to remove trailing newline characters.
      file_name = line.strip
      next unless File.file?(file_name)

      editor = new(file_name)
      if editor.dom.nil?
        puts "No DOM found in #{file_name}."
      else
        yield editor
        editor.save!
      end
    end
  end

  private

  # Remove abandoned anchors, that are not linked to.
  def remove_unlinked_anchors
    # Collect every href once, instead of running one CSS query per anchor.
    # This also avoids building a selector from raw attribute text.
    link_targets = {}
    @dom.css('a[href]').each { |link| link_targets[link['href']] = true }

    @dom.css('a[name]').each do |anchor|
      name = anchor['name']
      next if link_targets.key?("##{name}")

      puts "<a name=\"#{name}\"> was removed, because it had no links to it."
      anchor.replace(anchor.inner_html)
    end
  end

  # Clean up h1-h6 tags: move a named anchor's name to the heading id and
  # strip leading numbering.
  def clean_headings
    @dom.css('h1, h2, h3, h4, h5, h6').each do |heading|
      anchor = heading.at_css('a[name]')
      if anchor
        heading['id'] = anchor['name'].sub(/_Toc/, 'Toc')
        anchor.replace(anchor.inner_html)
      end
      # The non-breaking space after the numbering may appear as the literal
      # character or as an entity in the serialised markup; accept both.
      heading.inner_html = heading.inner_html.sub(/\A(\s*\d+\.?)+(?:\u00A0|&nbsp;|&#160;)*/, '').strip
    end
  end

  # Remove Words "normal" classes.
  def strip_word_classes
    UNWANTED_CLASSES.each do |class_name|
      @dom.css(".#{class_name}").each { |node| node.remove_attribute('class') }
    end
  end

  # Remove unwanted section tags, keeping their content.
  def unwrap_word_sections
    @dom.css('.WordSection1, .WordSection2, .WordSection3, .WordSection4, .WordSection5, .WordSection6, .WordSection7, .WordSection8').each do |section|
      puts "Removing #{section.name}.#{section['class']}"
      section.replace(section.inner_html)
    end
  end

  # Rewrite the MsoToc1..MsoToc4 entries as a nested ordered list.
  def rebuild_toc
    # Remove page numbers from TOC
    @dom.css('.MsoToc1 a, .MsoToc2 a, .MsoToc3 a, .MsoToc4 a').each do |item|
      item.inner_html = item.inner_text.sub(/\A(\d+\.)+/, '').sub(/(\s+\d+)\Z/, '').strip
    end
    # Rewrite Toc as ordered list.
    toc_item = @dom.at_css('.MsoToc1')
    previous_toc_level = 0
    new_toc = []
    while toc_item
      toc_item.inner_html = toc_item.inner_html.sub(/\n/, ' ')
      # The level digit directly follows the 6 characters of "MsoToc".
      current_toc_level = toc_item.attr('class')[6].to_i
      # NOTE(review): only one list level is closed when the level drops,
      # even if it drops by more than one - confirm whether that is intended.
      if previous_toc_level == current_toc_level
        new_toc << "</li>\n"
      elsif previous_toc_level > current_toc_level
        new_toc << "</ol>\n</li>\n"
      else
        new_toc << "\n<ol#{' id="toc"' if previous_toc_level == 0}>\n"
      end
      link = toc_item.at_css('a')
      if link.nil?
        puts toc_item.to_s
      else
        link.inner_html = link.inner_html.sub(/\A(\s*\d+)/, '').strip
        new_toc << "<li>#{toc_item.inner_html.sub(/#_Toc/, '#Toc')}"
      end
      previous_toc_level = current_toc_level
      # next_element already skips text nodes, so no extra loop is needed.
      toc_item = toc_item.next_element
      toc_item = nil unless toc_item && toc_item.attr('class') && toc_item.attr('class').start_with?('MsoToc')
    end
    first_entry = @dom.at_css('.MsoToc1')
    first_entry.replace(new_toc.join) if first_entry
    # Remove old Table of Contents
    @dom.css('.MsoToc1, .MsoToc2, .MsoToc3, .MsoToc4').remove
  end

  # Remove empty paragraphs: no image and no text other than whitespace
  # and non-breaking spaces.
  def remove_empty_paragraphs
    @dom.css('p').each do |paragraph|
      next if paragraph.at_css('img')
      next unless paragraph.content.delete(NBSP).strip.empty?

      puts 'Removing empty paragraph.'
      paragraph.remove
    end
  end
end
189
+
# Command-line entry point: edits every HTML file in the folder given as
# the first argument, appending a marker comment to each document.
if __FILE__ == $PROGRAM_NAME
  # Without an argument File.expand_path(nil) would raise a TypeError;
  # print a usage hint instead.
  abort "Usage: #{File.basename($PROGRAM_NAME)} FOLDER" if ARGV.empty?

  HeadlessHtmlEditor.edit_folder(File.expand_path(ARGV[0])) do |editor|
    editor.dom.at_css('html').add_child '<!-- HeadlessHtmlEditor was here! -->'
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: headless_html_editor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Bo Frederiksen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-03 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.6.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.6.0
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: Headless HTML Editor - edit HTML files, without a UI.
47
+ email:
48
+ - bofrede@bofrede.dk
49
+ executables: []
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - lib/headless_html_editor.rb
54
+ homepage: https://github.com/bofrede/headless_html_editor
55
+ licenses:
56
+ - MIT
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements: []
74
+ rubyforge_project:
75
+ rubygems_version: 1.8.24
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Headless HTML Editor - edit HTML files, without a UI.
79
+ test_files: []