metanorma-tools 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/rake.yml +15 -0
- data/.github/workflows/release.yml +24 -0
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.rubocop.yml +14 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/Gemfile +11 -0
- data/README.adoc +94 -0
- data/Rakefile +8 -0
- data/docs/figure-extraction.adoc +111 -0
- data/docs/iso-drg-filename-guidance.adoc +584 -0
- data/docs/workflows-iso.adoc +70 -0
- data/exe/metanorma-tools +6 -0
- data/lib/metanorma/tools/cli.rb +79 -0
- data/lib/metanorma/tools/commands/extract_images.rb +25 -0
- data/lib/metanorma/tools/commands.rb +8 -0
- data/lib/metanorma/tools/document_metadata.rb +40 -0
- data/lib/metanorma/tools/figure.rb +124 -0
- data/lib/metanorma/tools/figure_extractor.rb +384 -0
- data/lib/metanorma/tools/iso_graphic_filename.rb +149 -0
- data/lib/metanorma/tools/version.rb +7 -0
- data/lib/metanorma/tools.rb +18 -0
- data/metanorma-tools.gemspec +37 -0
- data/sig/metanorma/tools.rbs +6 -0
- data/spec/fixtures/document-en.dis.presentation.xml +3417 -0
- data/spec/metanorma/tools/cli_spec.rb +102 -0
- data/spec/metanorma/tools/document_metadata_spec.rb +308 -0
- data/spec/metanorma/tools/figure_extractor_spec.rb +265 -0
- data/spec/metanorma/tools/iso_graphic_filename_spec.rb +316 -0
- data/spec/metanorma/tools_spec.rb +15 -0
- data/spec/spec_helper.rb +16 -0
- metadata +148 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
|
5
|
+
module Metanorma
|
6
|
+
module Tools
|
7
|
+
class Cli < Thor
|
8
|
+
# Hook consulted by Thor: answering true asks it to exit with a failure
# status when a command errors instead of returning normally.
def self.exit_on_failure?
  true
end
|
11
|
+
|
12
|
+
map %w[--version -v] => :__version
|
13
|
+
|
14
|
+
desc '--version, -v', 'Print the version'
|
15
|
+
# Print the gem's version constant to standard output.
def __version
  version = Metanorma::Tools::VERSION
  puts version
end
|
18
|
+
|
19
|
+
desc 'extract-images INPUT_XML', 'Extract embedded figures from Metanorma presentation XML files'
|
20
|
+
long_desc <<~DESC
|
21
|
+
Extract embedded figures from Metanorma presentation XML files with ISO DRG compliance features.
|
22
|
+
|
23
|
+
This tool automatically extracts document metadata, generates ISO DRG compliant filenames,
|
24
|
+
and supports both SVG and PNG figure formats with optional ZIP packaging.
|
25
|
+
|
26
|
+
For detailed usage examples and options, see: docs/figure-extraction.adoc
|
27
|
+
DESC
|
28
|
+
option :output_dir, type: :string, aliases: '-o', desc: 'Output directory for extracted figures'
|
29
|
+
option :prefix, type: :string, aliases: '-p', desc: 'Prefix for generated figure filenames'
|
30
|
+
option :zip, type: :boolean, default: false, desc: 'Create a ZIP archive of extracted figures'
|
31
|
+
option :verbose, type: :boolean, default: false, aliases: '-v', desc: 'Show detailed progress information'
|
32
|
+
option :retain_original_filenames, type: :boolean, default: false, desc: 'For ISO documents, retain original filenames in generated names'
|
33
|
+
|
34
|
+
# Run figure extraction for INPUT_XML.
#
# The full option hash configures the extractor; the output directory and
# filename prefix options are additionally forwarded as explicit arguments
# (nil when the user did not supply them).
def extract_images(input_xml)
  target_dir = options[:output_dir]
  name_prefix = options[:prefix]
  FigureExtractor.new(options).extract(input_xml, target_dir, name_prefix)
end
|
38
|
+
|
39
|
+
# Placeholder for future comment management functionality
|
40
|
+
# Will integrate metanorma/commenter gem functionality
|
41
|
+
desc 'comment SUBCOMMAND', 'Manage ISO comment sheets (planned)'
|
42
|
+
subcommand 'comment', CommentCli if defined?(CommentCli)
|
43
|
+
|
44
|
+
# Placeholder for future ISO document fetching functionality
|
45
|
+
desc 'fetch-iso DOCUMENT_ID', 'Fetch ISO documents from OBP (planned)'
|
46
|
+
# Placeholder command: announces that ISO fetching is not implemented yet
# and echoes the requested document identifier.
def fetch_iso(document_id)
  puts(
    'ISO document fetching functionality is planned for future release.',
    'This will fetch ISO documents from the OBP into Metanorma format.',
    "Document ID: #{document_id}"
  )
end
|
51
|
+
|
52
|
+
# Help command that shows the expanded purpose
|
53
|
+
desc 'help [COMMAND]', 'Show help for metanorma-tools commands'
|
54
|
+
# Show help. Without a COMMAND an overview banner is printed first; in
# every case the inherited help implementation then runs with the same
# argument.
def help(command = nil)
  overview = <<~HELP
    Metanorma Tools - Standards Editing Lifecycle Support
    =====================================================

    Metanorma Tools supports the lifecycle of standards editing for various flavors
    to facilitate pre and post-compilation of Metanorma documents.

    Available tools:
    • Figure extraction - Extract embedded figures from Metanorma presentation XML files
    • Comment management - Manage ISO comment sheets (planned)
    • ISO document fetching - Fetch documents from OBP (planned)

    For detailed documentation, see the docs/ directory:
    • docs/figure-extraction.adoc - Figure extraction guide
    • docs/iso-drg-filename-guidance.adoc - ISO DRG compliance guidance

    Commands:
  HELP

  puts overview if command.nil?
  super
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
|
5
|
+
module Metanorma
|
6
|
+
module Tools
|
7
|
+
module Commands
|
8
|
+
class ExtractImages < Thor
|
9
|
+
desc "extract INPUT_XML", "Extract figures from Metanorma XML"
|
10
|
+
option :output_dir, type: :string, aliases: '-o', desc: 'Output directory for extracted figures'
|
11
|
+
option :prefix, type: :string, aliases: '-p', desc: 'Prefix for generated figure filenames'
|
12
|
+
option :zip, type: :boolean, default: false, desc: 'Create a ZIP archive of extracted figures'
|
13
|
+
option :verbose, type: :boolean, default: false, aliases: '-v', desc: 'Show detailed progress information'
|
14
|
+
option :auto_prefix, type: :boolean, default: true, desc: 'Automatically generate prefix from document metadata'
|
15
|
+
|
16
|
+
# Default task: extract figures from INPUT_XML.
#
# The option hash configures the extractor, while the output directory and
# prefix options are also handed over as explicit arguments.
def extract(input_xml)
  target_dir = options[:output_dir]
  name_prefix = options[:prefix]
  FigureExtractor.new(options).extract(input_xml, target_dir, name_prefix)
end
|
20
|
+
|
21
|
+
default_task :extract
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'lutaml/model'
|
4
|
+
|
5
|
+
module Metanorma
|
6
|
+
module Tools
|
7
|
+
class DocumentMetadata < Lutaml::Model::Serializable
|
8
|
+
attribute :title, :string
|
9
|
+
attribute :docnumber, :string
|
10
|
+
attribute :stage, :string
|
11
|
+
attribute :substage, :string
|
12
|
+
attribute :docidentifier, :string
|
13
|
+
attribute :standard_number, :string
|
14
|
+
attribute :part_number, :string
|
15
|
+
attribute :edition, :string
|
16
|
+
attribute :stage_code, :string
|
17
|
+
attribute :stage_abbreviation, :string
|
18
|
+
attribute :flavor, :string, default: -> { 'iso' }
|
19
|
+
|
20
|
+
# Build the filename prefix derived from this document's metadata.
#
# ISO documents use "<standard_number>_<stage abbreviation, lowercased>_ed<edition>";
# every other flavor gets the same string with "<flavor>_" in front.
def auto_prefix
  stage_part = stage_abbreviation&.downcase
  base = "#{standard_number}_#{stage_part}_ed#{edition}"
  flavor == 'iso' ? base : "#{flavor}_#{base}"
end
|
30
|
+
|
31
|
+
def to_s
|
32
|
+
if docidentifier
|
33
|
+
"#{docidentifier} - #{title}"
|
34
|
+
else
|
35
|
+
"ISO #{standard_number} Edition #{edition} Stage #{stage_code} (#{stage_abbreviation})"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'base64'
|
4
|
+
require 'fileutils'
|
5
|
+
require_relative 'iso_graphic_filename'
|
6
|
+
|
7
|
+
module Metanorma
|
8
|
+
module Tools
|
9
|
+
class Figure
|
10
|
+
FORMATS = {
|
11
|
+
datauri_png: { name: 'PNG', ext: 'png', binary: true },
|
12
|
+
datauri_jpeg: { name: 'JPEG', ext: 'jpg', binary: true },
|
13
|
+
datauri_gif: { name: 'GIF', ext: 'gif', binary: true },
|
14
|
+
datauri_svg: { name: 'SVG', ext: 'svg', binary: false },
|
15
|
+
datauri_webp: { name: 'WebP', ext: 'webp', binary: true },
|
16
|
+
svg: { name: 'SVG', ext: 'svg', binary: false }
|
17
|
+
}.freeze
|
18
|
+
|
19
|
+
attr_reader :autonum, :content, :format, :original_filename, :file_size
|
20
|
+
|
21
|
+
# Create a figure from its label, payload, format key and optional source
# filename. The size is computed from content and format, so it is set last.
def initialize(autonum, content, format, original_filename = nil)
  @autonum, @content, @format = autonum, content, format
  @original_filename = original_filename
  @file_size = calculate_size
end
|
28
|
+
|
29
|
+
# Write this figure into output_dir (created when missing) and return the
# path of the written file. The filename comes from generate_filename.
def to_file(output_dir, prefix, document_metadata = nil, retain_original_filenames = false)
  FileUtils.mkdir_p(output_dir)

  name = generate_filename(prefix, document_metadata, retain_original_filenames)
  File.join(output_dir, name).tap do |target|
    write_content(target)
    puts " Saved: #{name}"
  end
end
|
39
|
+
|
40
|
+
# Display name of the figure format: the :name registered in FORMATS, or
# the upper-cased format key when the format is not registered.
def format_name
  label = FORMATS.dig(@format, :name)
  return label if label

  @format.to_s.upcase
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# Size in bytes of the figure payload: the Base64-decoded length for the
# registered datauri_* formats, the raw length for inline :svg, else 0.
def calculate_size
  embedded = FORMATS.key?(@format) && @format.to_s.start_with?('datauri_')
  return Base64.decode64(@content).bytesize if embedded
  return @content.bytesize if @format == :svg

  0
end
|
56
|
+
|
57
|
+
# Write the figure payload to filepath. Nothing is written for formats
# missing from FORMATS. datauri_* payloads are Base64-decoded first; binary
# formats are written in binary mode, text formats as UTF-8.
def write_content(filepath)
  spec = FORMATS[@format]
  return unless spec

  payload = @format.to_s.start_with?('datauri_') ? Base64.decode64(@content) : @content

  if spec[:binary]
    File.open(filepath, 'wb', encoding: nil) { |io| io.write(payload) }
  else
    File.open(filepath, 'w', encoding: 'utf-8') { |io| io.write(payload) }
  end
end
|
74
|
+
|
75
|
+
# Build the output filename for this figure.
#
# With document metadata the name is produced by IsoGraphicFilename; the
# original filename is only passed along when retain_original_filenames is
# set and a non-empty original filename exists. Without metadata the name is
# "<prefix>fig<autonum without dots>[_<original basename>].<ext>".
def generate_filename(prefix, document_metadata = nil, retain_original_filenames = false)
  spec = FORMATS[@format]
  extension = spec ? spec[:ext] : 'unknown'
  has_original = @original_filename && !@original_filename.empty?

  if document_metadata
    # "C.2 a" splits into figure "C.2" and subfigure "a".
    figure_number, subfigure = parse_figure_number(@autonum)
    kept_original = retain_original_filenames && has_original ? @original_filename : nil

    return IsoGraphicFilename.new(
      standard_number: document_metadata.standard_number&.to_i,
      part_number: document_metadata.part_number&.to_i,
      edition_number: document_metadata.edition&.to_i,
      stage_code: document_metadata.stage_code,
      content_type: 'figure',
      figure_number: figure_number,
      subfigure: subfigure,
      file_extension: extension,
      original_filename: kept_original
    ).generate_filename
  end

  # Prefix-based fallback naming.
  stem = "#{prefix}fig#{@autonum.delete('.')}"
  return "#{stem}.#{extension}" unless has_original

  source_stem = File.basename(@original_filename, File.extname(@original_filename))
  "#{stem}_#{source_stem}.#{extension}"
end
|
111
|
+
|
112
|
+
# Split a figure label into [figure_number, subfigure].
#
#   "C.2 a" -> ["C.2", "a"]
#   "A.1"   -> ["A.1", nil]
#   "3"     -> ["3", nil]
#
# Surrounding whitespace is ignored. A subfigure is a single lowercase
# letter separated from the figure number by whitespace.
def parse_figure_number(autonum)
  label = autonum.to_s.strip

  # \A and \z anchor the whole string; ^ and $ would also match at line
  # breaks and could pick up a single line of a multi-line label.
  parsed = label.match(/\A(?<figure>.+?)\s+(?<subfigure>[a-z])\z/)
  return [label, nil] unless parsed

  [parsed[:figure], parsed[:subfigure]]
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,384 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'base64'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'zip'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
module Metanorma
|
10
|
+
module Tools
|
11
|
+
class FigureExtractor
|
12
|
+
METANORMA_NS = { 'xmlns' => 'https://www.metanorma.org/ns/standoc' }.freeze
|
13
|
+
|
14
|
+
MIMETYPE_FORMATS = {
|
15
|
+
'image/png' => :datauri_png,
|
16
|
+
'image/jpeg' => :datauri_jpeg,
|
17
|
+
'image/jpg' => :datauri_jpeg,
|
18
|
+
'image/gif' => :datauri_gif,
|
19
|
+
'image/svg+xml' => :datauri_svg,
|
20
|
+
'image/webp' => :datauri_webp,
|
21
|
+
'' => :datauri_png # Default for empty mimetype
|
22
|
+
}.freeze
|
23
|
+
|
24
|
+
attr_reader :options
|
25
|
+
|
26
|
+
# Store the extractor options. Keys are converted to symbols so string-keyed
# and symbol-keyed hashes behave alike; :zip and :verbose default to false
# and are overridden by any value the caller supplies.
def initialize(options = {})
  defaults = { zip: false, verbose: false }
  @options = defaults.merge(options.transform_keys(&:to_sym))
end
|
34
|
+
|
35
|
+
# Extract all figures from the presentation XML file at input_xml.
#
# Steps: validate the input path, parse the XML, read document metadata,
# settle the filename prefix and output directory, collect the figures,
# write them (ZIP archive when options[:zip] is set, plain directory
# otherwise) and print a summary.
#
# output_dir and prefix are optional; when omitted they are derived by
# determine_output_dir and determine_prefix.
def extract(input_xml, output_dir = nil, prefix = nil)
  validate_input(input_xml)

  doc = parse_xml(input_xml)
  metadata = extract_document_metadata(doc)
  prefix = determine_prefix(prefix, metadata)
  output_dir = determine_output_dir(output_dir, prefix)

  figures = find_figures(doc)
  return if figures.empty?

  figure_objects, format_counts, total_size = process_figures(figures)

  # The list of written paths is not needed here; the summary reports the
  # destination directory instead.
  if options[:zip]
    extract_to_zip(figure_objects, output_dir, prefix, metadata)
  else
    extract_to_directory(figure_objects, output_dir, prefix, metadata)
  end

  print_summary(metadata, prefix, figure_objects.length, format_counts, total_size, output_dir)
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
# Abort the process with exit status 1 (after printing an error) when the
# input path does not exist; otherwise do nothing.
def validate_input(input_xml)
  unless File.exist?(input_xml)
    puts "Error: Input file '#{input_xml}' does not exist."
    exit 1
  end
end
|
65
|
+
|
66
|
+
# Read and parse the XML file, returning the parsed document.
#
# Any StandardError raised while opening or parsing is reported on standard
# output (with a backtrace in verbose mode) and ends the process with exit
# status 1.
def parse_xml(input_xml)
  puts "Reading XML file: #{input_xml}"
  File.open(input_xml) do |io|
    Nokogiri::XML(io)
  end
rescue StandardError => error
  puts "Error processing file: #{error.message}"
  puts error.backtrace if options[:verbose]
  exit 1
end
|
74
|
+
|
75
|
+
# Decide which filename prefix to use.
#
# An explicit non-empty prefix wins. With no prefix given (nil) and metadata
# available, the metadata's auto_prefix is used. If the outcome is still nil
# or empty, the literal "figure" is used.
def determine_prefix(prefix, metadata)
  chosen = prefix

  if chosen.nil? && metadata
    chosen = metadata.auto_prefix
    puts "Auto-generated prefix: #{chosen}"
  end

  return chosen unless chosen.nil? || chosen.empty?

  fallback = 'figure'
  puts "Using default prefix: #{fallback}"
  fallback
end
|
88
|
+
|
89
|
+
# Decide where output goes. A non-empty output_dir is returned untouched.
# Otherwise ZIP mode uses the current working directory, and directory mode
# uses the prefix as the directory name.
def determine_output_dir(output_dir, prefix)
  return output_dir unless output_dir.nil? || output_dir.empty?

  if options[:zip]
    Dir.pwd.tap { |cwd| puts "Using current directory for ZIP output: #{cwd}" }
  else
    puts "Using auto-generated output directory: #{prefix}"
    prefix
  end
end
|
103
|
+
|
104
|
+
# Write every figure into a temporary staging directory, then move the
# files into output_dir (created when missing). Returns the final paths in
# figure order.
def extract_to_directory(figure_objects, output_dir, prefix, metadata = nil)
  Dir.mktmpdir('metanorma_figures_') do |staging_dir|
    puts "\nExtracting #{figure_objects.length} figures to temporary directory: #{staging_dir}"

    keep_names = should_retain_original_filenames?(metadata)
    staged_paths = figure_objects.map do |figure|
      figure.to_file(staging_dir, prefix, metadata, keep_names)
    end

    FileUtils.mkdir_p(output_dir)
    puts "Moving files to final destination: #{output_dir}"

    staged_paths.map do |staged_path|
      name = File.basename(staged_path)
      File.join(output_dir, name).tap do |destination|
        FileUtils.mv(staged_path, destination)
        puts " Moved: #{name}"
      end
    end
  end
end
|
129
|
+
|
130
|
+
# Write every figure into a temporary directory and pack the files into
# "<prefix>.zip" inside output_dir (or the current directory when
# output_dir is nil). Returns a one-element array holding the archive path.
def extract_to_zip(figure_objects, output_dir, prefix, metadata = nil)
  zip_filename = "#{prefix}.zip"
  FileUtils.mkdir_p(output_dir) if output_dir
  zip_path = File.join(output_dir || Dir.pwd, zip_filename)

  Dir.mktmpdir('metanorma_figures_zip_') do |temp_dir|
    puts "\nExtracting #{figure_objects.length} figures to temporary directory for ZIP: #{temp_dir}"

    retain_original_filenames = should_retain_original_filenames?(metadata)
    temp_files = figure_objects.map do |figure_obj|
      figure_obj.to_file(temp_dir, prefix, metadata, retain_original_filenames)
    end

    # Start from a fresh archive: opening an existing one and adding entries
    # whose names are already present fails, which broke repeated runs.
    # Removal happens only after staging succeeded, so a failed run leaves
    # the previous archive in place.
    FileUtils.rm_f(zip_path)

    puts "Creating ZIP archive: #{zip_filename}"
    Zip::File.open(zip_path, Zip::File::CREATE) do |zipfile|
      temp_files.each do |temp_file|
        filename = File.basename(temp_file)
        zipfile.add(filename, temp_file)
        puts " Added to ZIP: #{filename}"
      end
    end

    puts "ZIP archive created: #{zip_path}"
    [zip_path]
  end
end
|
155
|
+
|
156
|
+
# Return all figure elements in the document, reporting how many were
# found. When there are none, a notice is printed and the process exits
# with status 0.
def find_figures(doc)
  doc.xpath('//xmlns:figure', METANORMA_NS).tap do |nodes|
    puts "Found #{nodes.length} figures"
    next unless nodes.empty?

    puts 'No figures found in the document'
    exit 0
  end
end
|
167
|
+
|
168
|
+
# Convert figure elements into figure objects, dropping elements that could
# not be converted. Returns [figure_objects, count per format name, total
# size in bytes].
def process_figures(figures)
  extracted = figures.each_with_index.map do |node, position|
    process_single_figure(node, position)
  end.compact

  per_format = extracted.each_with_object(Hash.new(0)) do |figure, counts|
    counts[figure.format_name] += 1
  end

  [extracted, per_format, extracted.sum(&:file_size)]
end
|
184
|
+
|
185
|
+
# Turn one figure element into a figure object, or return nil when the
# element has no usable autonum attribute or contains no image element.
# Skipped figures are reported only in verbose mode.
def process_single_figure(figure_element, index)
  autonum = figure_element['autonum']

  if autonum.to_s.strip.empty?
    puts "Warning: Skipping figure #{index + 1} - missing autonum" if options[:verbose]
    return nil
  end

  image_element = figure_element.xpath('.//xmlns:image', METANORMA_NS).first
  return create_figure_from_image(image_element, autonum) if image_element

  puts "Warning: Skipping figure #{index + 1} (autonum: #{autonum}) - no image element" if options[:verbose]
  nil
end
|
201
|
+
|
202
|
+
# Dispatch on how the image is stored: a data: URI in src, inline SVG
# (SVG mimetype or a filename ending in ".svg"), or anything else, which is
# logged as unsupported and yields nil.
def create_figure_from_image(image_element, autonum)
  src, filename, mimetype = %w[src filename mimetype].map { |attribute| image_element[attribute] }

  return create_data_uri_figure(src, filename, autonum) if src&.start_with?('data:')

  inline_svg = mimetype == 'image/svg+xml' || filename&.end_with?('.svg')
  return create_svg_figure(image_element, src, autonum) if inline_svg

  log_unsupported_figure(autonum, mimetype, src)
  nil
end
|
216
|
+
|
217
|
+
# Build a figure from a data: URI. Returns nil (with a warning in verbose
# mode) when the URI cannot be parsed.
def create_data_uri_figure(src, filename, autonum)
  parsed = parse_data_uri(src)

  if parsed
    puts " Figure #{autonum}: Data URI #{parsed[:format_name]}"
    return Figure.new(autonum, parsed[:content], parsed[:format], filename)
  end

  puts "Warning: Skipping figure #{autonum} - malformed data URI" if options[:verbose]
  nil
end
|
227
|
+
|
228
|
+
# Build a figure from SVG markup nested inside the image element. Returns
# nil (with a warning in verbose mode) when the element has no inner
# markup. src is kept as the original filename unless it is a data: URI.
def create_svg_figure(image_element, src, autonum)
  markup = image_element.inner_html

  if markup.empty?
    puts "Warning: Skipping figure #{autonum} - empty SVG content" if options[:verbose]
    return nil
  end

  source_name = src&.start_with?('data:') ? nil : src
  suffix = source_name ? " (#{File.basename(source_name)})" : ''
  puts " Figure #{autonum}: SVG#{suffix}"
  Figure.new(autonum, markup, :svg, source_name)
end
|
239
|
+
|
240
|
+
# In verbose mode, explain why a figure was skipped: either it points at an
# external file (mimetype and src both present) or it has no usable source.
def log_unsupported_figure(autonum, mimetype, src)
  return unless options[:verbose]

  reason = if mimetype && src
             "external file not supported: #{File.basename(src)}"
           else
             'no valid source or mimetype found'
           end
  puts "Warning: Skipping figure #{autonum} - #{reason}"
end
|
249
|
+
|
250
|
+
# Print the end-of-run report: document line (when metadata is available),
# prefix, figure totals, per-format counts, total size, destination, ZIP
# status and the compliance section.
def print_summary(metadata, prefix, total_figures, format_counts, total_size, output_dir)
  rule = '=' * 60
  puts "\n#{rule}", 'EXTRACTION SUMMARY', rule

  if metadata
    puts "Document: #{metadata}"
    puts "Auto-generated prefix: #{metadata.auto_prefix}" if options[:auto_prefix]
  end

  puts "File prefix used: #{prefix}", "Total figures extracted: #{total_figures}"

  format_counts.each_pair { |label, count| puts "#{label} files: #{count}" }

  zip_status = options[:zip] ? 'Created' : 'Not requested'
  puts "Total size: #{format_bytes(total_size)}",
       "Output directory: #{output_dir}",
       "ZIP archive: #{zip_status}"

  print_compliance_info(format_counts, metadata)
  puts rule, "\nSuccessfully extracted #{total_figures} figures to #{output_dir}"
end
|
273
|
+
|
274
|
+
# Print the ISO DRG compliance section. Only the SVG line and the metadata
# line depend on the arguments; the other two lines are fixed text.
def print_compliance_info(format_counts, metadata)
  has_vector = (format_counts['SVG'] || 0).positive?

  puts "\nISO DRG COMPLIANCE:",
       "✓ Revisable vector graphics (SVG): #{has_vector ? 'Yes' : 'No'}",
       '✓ Proper file naming convention: Yes',
       '✓ Language-neutral graphics: Yes (extracted from Metanorma)',
       "✓ Document metadata extraction: #{metadata ? 'Yes' : 'No'}"
end
|
282
|
+
|
283
|
+
# Read document metadata from the parsed XML and return a DocumentMetadata,
# or nil when the document has no bibdata element.
#
# Where several language variants may exist, the element with language=""
# is preferred and the first matching element is the fallback. This now
# also holds for the stage element, which previously had no fallback, so a
# document without a language="" stage lost its stage and abbreviation.
def extract_document_metadata(doc)
  # Flavor comes from the root metanorma element; 'iso' is the default.
  metanorma_element = doc.xpath('/xmlns:metanorma', METANORMA_NS).first
  flavor = metanorma_element&.[]('flavor') || 'iso'

  bibdata = doc.xpath('//xmlns:bibdata', METANORMA_NS).first
  return nil unless bibdata

  title = xpath_text(bibdata, './/xmlns:title[@type="main"][@language=""]') ||
          xpath_text(bibdata, './/xmlns:title[@type="main"]') ||
          xpath_text(bibdata, './/xmlns:title')

  docnumber = xpath_text(bibdata, './/xmlns:docnumber')

  docidentifier = xpath_text(bibdata, './/xmlns:docidentifier[@type="ISO"]') ||
                  xpath_text(bibdata, './/xmlns:docidentifier')

  stage_element = bibdata.xpath('.//xmlns:status/xmlns:stage[@language=""]', METANORMA_NS).first ||
                  bibdata.xpath('.//xmlns:status/xmlns:stage', METANORMA_NS).first
  stage = stage_element&.text&.strip
  stage_abbreviation = stage_element&.[]('abbreviation')

  substage = xpath_text(bibdata, './/xmlns:status/xmlns:substage')

  edition = xpath_text(bibdata, './/xmlns:edition[@language=""]') ||
            xpath_text(bibdata, './/xmlns:edition')

  # "17301-1" splits into standard number "17301" and part number "1";
  # a docnumber without "-" is used whole as the standard number.
  standard_number = docnumber
  part_number = nil
  standard_number, part_number = docnumber.split('-', 2) if docnumber&.include?('-')

  DocumentMetadata.new(
    title: title,
    docnumber: docnumber,
    stage: stage,
    substage: substage,
    docidentifier: docidentifier,
    standard_number: standard_number,
    part_number: part_number,
    edition: edition,
    # The stage code is the lower-cased stage abbreviation.
    stage_code: stage_abbreviation&.downcase,
    stage_abbreviation: stage_abbreviation,
    flavor: flavor
  )
end
|
339
|
+
|
340
|
+
# Stripped text of the first node matching xpath under element, or nil when
# nothing matches.
def xpath_text(element, xpath)
  first_match = element.xpath(xpath, METANORMA_NS).first
  first_match&.text&.strip
end
|
343
|
+
|
344
|
+
# Split a data: URI into its format and payload.
#
# Returns { format:, format_name:, content: } where content is the part
# after the first comma (still encoded), or nil when the input is not a
# data: URI or has no comma separating header and payload.
#
# The media type is matched case-insensitively. An empty or unknown media
# type falls back to PNG, as declared in MIMETYPE_FORMATS. Previously an
# empty header ("data:,....") raised internally and was reported as
# malformed instead of reaching that default.
def parse_data_uri(data_uri)
  return nil unless data_uri.is_a?(String) && data_uri.start_with?('data:')

  header, payload = data_uri.delete_prefix('data:').split(',', 2)
  return nil if payload.nil?

  # split yields [] for an empty header, hence to_s before normalising.
  mimetype = header.split(';').first.to_s.strip.downcase
  format = MIMETYPE_FORMATS.fetch(mimetype, :datauri_png)

  {
    format: format,
    format_name: Figure::FORMATS.dig(format, :name) || 'PNG',
    content: payload
  }
rescue ArgumentError
  # Strings with invalid byte sequences cannot be split; treat as malformed.
  nil
end
|
364
|
+
|
365
|
+
# Whether original filenames should be kept in generated names: the
# :retain_original_filenames option must be set and the document flavor
# must be "iso" (compared case-insensitively).
def should_retain_original_filenames?(metadata)
  requested = options[:retain_original_filenames]
  return requested unless requested

  metadata&.flavor&.downcase == 'iso'
end
|
369
|
+
|
370
|
+
# Render a byte count with a unit of B, KB, MB or GB, dividing by 1024 per
# step and rounding to two decimals. Values beyond the GB range stay in GB.
def format_bytes(bytes)
  amount = bytes.to_f

  # Walk up the larger units, stopping once the amount drops below 1024.
  unit = %w[KB MB GB].reduce('B') do |current, larger|
    break current if amount < 1024

    amount /= 1024
    larger
  end

  "#{amount.round(2)} #{unit}"
end
|
382
|
+
end
|
383
|
+
end
|
384
|
+
end
|