headless_html_editor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/headless_html_editor.rb +194 -0
  2. metadata +79 -0
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+ # rubocop:disable LineLength, MethodLength
4
+
5
# Nokogiri is the only third-party dependency of this file. When it is not
# installed, tell the user how to get it and stop with a non-zero exit status
# so that shells and build scripts can detect the failure (a bare `exit`
# reports success).
begin
  require 'nokogiri'
rescue LoadError => le
  $stderr.puts "LoadError: #{le.message}"
  $stderr.puts 'Run: gem install nokogiri'
  exit 1
end
12
+
13
# Headless HTML Editor. Edit HTML files programmatically.
#
# An editor wraps one HTML file parsed with Nokogiri. The parsed document is
# exposed through #dom so callers can modify it directly; the helper methods
# below perform common clean-ups of HTML exported from MS Word.
class HeadlessHtmlEditor
  # The parsed Nokogiri document, or nil when the input file was not an
  # existing file with the extension ".html".
  attr_reader :dom

  # Class names emitted by MS Word that #remove_word_artifacts strips.
  UNWANTED_CLASSES = %w[MsoNormal MsoBodyText NormalBold MsoTitle MsoHeader Templatehelp
                        TOCEntry Indent1 MsoCaption MsoListParagraph
                        MsoNormalTable MsoTableGrid MsoTableClassic1
                        MsoListParagraphCxSpFirst MsoListParagraphCxSpMiddle MsoListParagraphCxSpLast
                        MsoCommentText msocomtxt msocomoff MsoEndnoteText MsoFootnoteText].freeze

  # Selector for the section wrappers (WordSection1 .. WordSection8).
  WORD_SECTION_SELECTOR = (1..8).map { |n| ".WordSection#{n}" }.join(', ').freeze

  # Selectors for the table-of-contents paragraphs (MsoToc1 .. MsoToc9) and
  # for the links inside them.
  TOC_ITEM_SELECTOR = (1..9).map { |n| ".MsoToc#{n}" }.join(', ').freeze
  TOC_LINK_SELECTOR = (1..9).map { |n| ".MsoToc#{n} a" }.join(', ').freeze

  # Leading section number of a heading ("1.2. "), followed by any
  # non-breaking spaces, either as the literal character U+00A0 or as the
  # entities an HTML serializer may emit for it.
  HEADING_NUMBER = /\A(?:\s*\d+\.?)+(?:\u00A0|&nbsp;|&#160;)*/

  # Create a new Headless HTML Editor.
  #
  # input_file_name - path of the HTML file to edit.
  # input_encoding  - external encoding used when opening the file.
  #
  # The file is only parsed when it exists and its extension is ".html";
  # otherwise #dom stays nil. Prints "R: <file name>" when a file is read.
  def initialize(input_file_name, input_encoding = 'utf-8')
    @input_file_name = input_file_name
    @dom = nil
    # File.extname is used instead of File.fnmatch?('**.html', ...), because
    # fnmatch wildcards do not match a leading period, which silently
    # rejected relative paths such as "./page.html".
    return unless File.file?(input_file_name) && File.extname(input_file_name) == '.html'

    # read html file
    puts "R: #{input_file_name}"
    # The block form closes the file handle once parsing is done.
    @dom = File.open(input_file_name, "r:#{input_encoding}", universal_newline: false) do |file|
      Nokogiri::HTML(file)
    end
  end

  # Cleanup after MS Word.
  #
  # options - Hash; the TOC is rewritten as an ordered list only when
  #           options[:rebuild_toc] is truthy (the default).
  #
  # Steps, in order: drop the Generator meta tag, unwrap anchors nobody links
  # to, clean up headings, strip Word's class names, unwrap WordSection
  # wrappers, optionally rebuild the TOC, remove empty paragraphs and remove
  # line breaks that directly follow a table.
  def remove_word_artifacts(options = { rebuild_toc: true })
    @dom.css('meta[name="Generator"]').remove
    remove_abandoned_anchors
    clean_headings
    remove_unwanted_classes
    unwrap_word_sections
    rebuild_toc if options[:rebuild_toc]
    remove_empty_paragraphs
    # NOTE: Word's conditional comments are not removed. The original author
    # left this (unused) pattern as a starting point:
    # /<!--\[if[.\n\r]+\[endif\]\s*-->/
    @dom.css('table + br').remove
  end

  # Remove script tags from the header
  # (every script element in the document is removed).
  def remove_header_scripts
    @dom.css('script').remove
  end

  # Change tracking in MS Word, adds a lot of ins and del tags. These tags are removed.
  # Deleted text (del) is dropped; inserted text (ins) is kept without the tag.
  def accept_word_changes_tracked
    @dom.css('del').remove
    @dom.css('ins').each { |ins| unwrap(ins) }
  end

  # Change h1 to h2 and so on. h6 is not changed, so its a potential mess.
  def demote_headings
    @dom.css('h1, h2, h3, h4, h5').each do |heading|
      heading.name = "h#{heading.name[1].to_i + 1}"
    end
  end

  # Save the file with the same file name.
  def save!(output_encoding = 'utf-8')
    save_as!(@input_file_name, output_encoding)
  end

  # Save file with a new file name.
  #
  # Prints "W: <file name>". Errors are reported on $stderr instead of being
  # raised. An existing read-only file is left alone.
  def save_as!(output_file_name, output_encoding = 'utf-8')
    puts "W: #{output_file_name}"
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    if File.exist?(output_file_name) && !File.writable?(output_file_name)
      $stderr.puts 'Failed: Read only!'
      return
    end
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error must happen first to keep the old content intact.
    html = @dom.to_html({ encoding: output_encoding, indent: 2 })
    File.open(output_file_name, "w:#{output_encoding}", universal_newline: false) do |f|
      f.write html
    end
  rescue StandardError => se
    $stderr.puts "\nFailed!\n#{se.message}"
  end

  # Edit all HTML files in a folder.
  #
  # Yields an editor for every ".html" file directly inside +folder+ (no
  # recursion) and saves the file after the block returns. Backslashes in
  # +folder+ are converted to forward slashes. Entries are processed in
  # sorted order so runs are reproducible.
  def self.edit_folder(folder, &block)
    directory = folder.tr('\\', '/')
    Dir.entries(directory).sort.each do |entry|
      file_name = File.join(directory, entry)
      next unless File.file?(file_name)

      editor = new(file_name)
      next if editor.dom.nil?

      yield editor
      editor.save!
    end
  end

  # Edit files listed in a text file. File names are absolute.
  # If the first character on a line is # the line is ignored.
  #
  # Lines that do not name an existing file are skipped. Files that exist but
  # were not parsed are reported on stdout.
  def self.bulk_edit(file_list_file_name, &block)
    txt_file_name = File.expand_path(file_list_file_name)
    File.readlines(txt_file_name).each do |line|
      next if line.start_with?('#')

      # Strip added to remove trailing newline characters.
      file_name = line.strip
      next unless File.file?(file_name)

      editor = new(file_name)
      if editor.dom.nil?
        puts "No DOM found in #{file_name}."
      else
        yield editor
        editor.save!
      end
    end
  end

  private

  # Replace +node+ with its own children. Moving the child nodes (instead of
  # re-parsing inner_html) keeps nested nodes attached to the document, so
  # nested ins/section/anchor elements queued for processing are still found.
  def unwrap(node)
    node.replace(node.children)
  end

  # Remove abandoned anchors, that are not linked to.
  # The link targets are collected once up front, which avoids one CSS query
  # per anchor and avoids building a selector from the anchor name (names
  # containing quotes would produce an invalid selector).
  def remove_abandoned_anchors
    link_targets = @dom.css('a[href]').each_with_object({}) do |link, targets|
      targets[link['href']] = true
    end
    @dom.css('a[name]').each do |anchor|
      next if link_targets.key?("##{anchor['name']}")

      puts "<a name=\"#{anchor['name']}\"> was removed, because it had no links to it."
      unwrap(anchor)
    end
  end

  # Clean up h1-h6 tags: move the name of a contained anchor to the heading's
  # id (with "_Toc" rewritten to "Toc") and drop the leading section number.
  def clean_headings
    @dom.css('h1, h2, h3, h4, h5, h6').each do |heading|
      anchor = heading.at_css('a[name]')
      if anchor
        heading['id'] = anchor['name'].sub(/_Toc/, 'Toc')
        unwrap(anchor)
      end
      heading.inner_html = strip_heading_number(heading.inner_html)
    end
  end

  # Return +html+ without a leading section number and surrounding whitespace.
  def strip_heading_number(html)
    html.sub(HEADING_NUMBER, '').strip
  end

  # Remove Words "normal" classes. Other class names on the same element are
  # kept; the class attribute is removed when nothing else remains.
  def remove_unwanted_classes
    selector = UNWANTED_CLASSES.map { |class_name| ".#{class_name}" }.join(', ')
    @dom.css(selector).each do |node|
      kept = node['class'].to_s.split - UNWANTED_CLASSES
      if kept.empty?
        node.remove_attribute('class')
      else
        node['class'] = kept.join(' ')
      end
    end
  end

  # Remove unwanted section tags, keeping their content.
  def unwrap_word_sections
    @dom.css(WORD_SECTION_SELECTOR).each do |section|
      puts "Removing #{section.name}.#{section['class']}"
      unwrap(section)
    end
  end

  # Rewrite Word's table of contents (a run of MsoTocN paragraphs) as a nested
  # ordered list with id "toc", inserted where the first MsoToc1 paragraph was.
  def rebuild_toc
    # Remove page numbers from TOC
    @dom.css(TOC_LINK_SELECTOR).each do |link|
      # content= escapes the text; assigning plain text through inner_html=
      # would let "<" or "&" in a title be parsed as markup.
      link.content = link.inner_text.sub(/\A(\d+\.)+/, '').sub(/(\s+\d+)\Z/, '').strip
    end
    # Rewrite Toc as ordered list.
    first_item = @dom.at_css('.MsoToc1')
    first_item.replace(build_toc_html(collect_toc_entries(first_item))) if first_item
    # Remove old Table of Contents
    @dom.css(TOC_ITEM_SELECTOR).remove
  end

  # Walk the consecutive TOC paragraphs starting at +first_item+ and return
  # an Array of [level, inner html] pairs. Items without a link are printed
  # and skipped. The walk stops at the first element that is not a TOC item.
  def collect_toc_entries(first_item)
    entries = []
    item = first_item
    while item
      level = toc_level(item)
      break unless level

      link = item.at_css('a')
      if link
        link.inner_html = link.inner_html.sub(/\A(\s*\d+)/, '').strip
        # Join source lines wrapped by Word and point the link at the
        # heading ids assigned by #clean_headings.
        entries << [level, item.inner_html.tr("\n", ' ').sub(/#_Toc/, '#Toc')]
      else
        puts item.to_s
      end
      # next_element already skips text nodes.
      item = item.next_element
    end
    entries
  end

  # TOC level (1-9) taken from the MsoTocN class of +node+, or nil when the
  # node is not a TOC item.
  def toc_level(node)
    digit = node['class'].to_s[/\bMsoToc([1-9])/, 1]
    digit && digit.to_i
  end

  # Build the markup of a nested ordered list from [level, html] pairs.
  # A stack of open list levels makes sure that every list and list item is
  # closed, also when the level drops by more than one step.
  def build_toc_html(entries)
    return '' if entries.empty?

    parts = []
    open_levels = []
    entries.each do |level, item_html|
      if open_levels.empty? || level > open_levels.last
        parts << "\n<ol#{' id="toc"' if open_levels.empty?}>\n"
        open_levels.push(level)
      else
        parts << "</li>\n"
        while open_levels.size > 1 && level < open_levels.last
          open_levels.pop
          parts << "</ol>\n</li>\n"
        end
      end
      parts << "<li>#{item_html}"
    end
    parts << "</li>\n" << ("</ol>\n</li>\n" * (open_levels.size - 1)) << "</ol>\n"
    parts.join
  end

  # Remove empty paragraphs: no text other than whitespace and no image.
  def remove_empty_paragraphs
    @dom.css('p').each do |paragraph|
      next unless blank_text?(paragraph.content) && !paragraph.at_css('img')

      puts 'Removing empty paragraph.'
      paragraph.remove
    end
  end

  # True when +text+ contains nothing but whitespace and non-breaking spaces.
  # The non-breaking space is U+00A0 (its UTF-8 bytes are C2 A0); String#strip
  # does not remove it, so it is deleted explicitly.
  def blank_text?(text)
    text.gsub("\u00A0", '').strip.empty?
  end
end
189
+
190
# Command-line entry point: append a marker comment to every HTML file in
# the folder given as the first argument.
if __FILE__ == $PROGRAM_NAME
  # Without this check File.expand_path(nil) raises a TypeError.
  abort "Usage: #{File.basename($PROGRAM_NAME)} FOLDER" if ARGV.empty?

  HeadlessHtmlEditor.edit_folder(File.expand_path(ARGV[0])) do |editor|
    root = editor.dom.at_css('html')
    # Skip documents in which no html element was found.
    root.add_child '<!-- HeadlessHtmlEditor was here! -->' if root
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: headless_html_editor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Bo Frederiksen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-03 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.6.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.6.0
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: Headless HTML Editor - edit HTML files, without a UI.
47
+ email:
48
+ - bofrede@bofrede.dk
49
+ executables: []
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - lib/headless_html_editor.rb
54
+ homepage: https://github.com/bofrede/headless_html_editor
55
+ licenses:
56
+ - MIT
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements: []
74
+ rubyforge_project:
75
+ rubygems_version: 1.8.24
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Headless HTML Editor - edit HTML files, without a UI.
79
+ test_files: []