dita-cleanup 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dita/__init__.py +0 -0
- dita/cleanup/__init__.py +36 -0
- dita/cleanup/__main__.py +28 -0
- dita/cleanup/cli.py +212 -0
- dita/cleanup/out.py +38 -0
- dita/cleanup/xml.py +224 -0
- dita_cleanup-0.9.0.dist-info/METADATA +102 -0
- dita_cleanup-0.9.0.dist-info/RECORD +12 -0
- dita_cleanup-0.9.0.dist-info/WHEEL +5 -0
- dita_cleanup-0.9.0.dist-info/entry_points.txt +2 -0
- dita_cleanup-0.9.0.dist-info/licenses/LICENSE +21 -0
- dita_cleanup-0.9.0.dist-info/top_level.txt +1 -0
dita/__init__.py
ADDED
|
File without changes
|
dita/cleanup/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright (C) 2025 Jaromir Hradilek
|
|
2
|
+
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
# a copy of this software and associated documentation files (the "Soft-
|
|
7
|
+
# ware"), to deal in the Software without restriction, including without
|
|
8
|
+
# limitation the rights to use, copy, modify, merge, publish, distribute,
|
|
9
|
+
# sublicense, and/or sell copies of the Software, and to permit persons to
|
|
10
|
+
# whom the Software is furnished to do so, subject to the following condi-
|
|
11
|
+
# tions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included
|
|
14
|
+
# in all copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
17
|
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABI-
|
|
18
|
+
# LITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
19
|
+
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
|
|
20
|
+
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
21
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
22
|
+
# OTHER DEALINGS IN THE SOFTWARE.
|
|
23
|
+
|
|
24
|
+
"""Clean up DITA topics after conversion from AsciiDoc."""
|
|
25
|
+
|
|
26
|
+
# Module metadata:
__author__ = 'Jaromir Hradilek'
__copyright__ = 'Copyright (C) 2025 Jaromir Hradilek'
__license__ = 'MIT License'
# The module docstring doubles as the one-line project description.
__description__ = __doc__
__version__ = '0.9.0'

# Expose general information about the project:
# NAME, VERSION and DESCRIPTION are imported by the command-line interface
# (program name, version string, help text); NAME is also used as the prefix
# of diagnostic messages.
NAME = 'dita-cleanup'
VERSION = __version__
DESCRIPTION = __doc__
|
dita/cleanup/__main__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Copyright (C) 2025 Jaromir Hradilek
|
|
2
|
+
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
# a copy of this software and associated documentation files (the "Soft-
|
|
7
|
+
# ware"), to deal in the Software without restriction, including without
|
|
8
|
+
# limitation the rights to use, copy, modify, merge, publish, distribute,
|
|
9
|
+
# sublicense, and/or sell copies of the Software, and to permit persons to
|
|
10
|
+
# whom the Software is furnished to do so, subject to the following condi-
|
|
11
|
+
# tions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included
|
|
14
|
+
# in all copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
17
|
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABI-
|
|
18
|
+
# LITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
19
|
+
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
|
|
20
|
+
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
21
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
22
|
+
# OTHER DEALINGS IN THE SOFTWARE.
|
|
23
|
+
|
|
24
|
+
import sys
|
|
25
|
+
from . import cli
|
|
26
|
+
|
|
27
|
+
# Start the command-line interface only when this module is executed
# directly, not when it is imported.
if __name__ == '__main__':
    cli.run()
|
dita/cleanup/cli.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# Copyright (C) 2025 Jaromir Hradilek
|
|
2
|
+
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
# a copy of this software and associated documentation files (the "Soft-
|
|
7
|
+
# ware"), to deal in the Software without restriction, including without
|
|
8
|
+
# limitation the rights to use, copy, modify, merge, publish, distribute,
|
|
9
|
+
# sublicense, and/or sell copies of the Software, and to permit persons to
|
|
10
|
+
# whom the Software is furnished to do so, subject to the following condi-
|
|
11
|
+
# tions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included
|
|
14
|
+
# in all copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
17
|
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABI-
|
|
18
|
+
# LITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
19
|
+
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
|
|
20
|
+
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
21
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
22
|
+
# OTHER DEALINGS IN THE SOFTWARE.
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import sys
|
|
26
|
+
|
|
27
|
+
from errno import EPERM
|
|
28
|
+
from lxml import etree
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from . import NAME, VERSION, DESCRIPTION
|
|
31
|
+
from .out import warn
|
|
32
|
+
from .xml import replace_attributes, update_image_paths, prune_ids, \
|
|
33
|
+
prune_includes, list_ids, update_xref_targets
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
'run'
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
def list_files(directory: str) -> list[Path]:
|
|
40
|
+
result: list[Path] = []
|
|
41
|
+
for root, dirs, files in Path(directory).walk(top_down=True, on_error=print):
|
|
42
|
+
for name in files:
|
|
43
|
+
if name.endswith('.dita') or name.endswith('.xml'):
|
|
44
|
+
result.append(Path(root, name))
|
|
45
|
+
return result
|
|
46
|
+
|
|
47
|
+
def catalog_ids(directory: str) -> dict[str, list[str]]:
    """Build a catalog of the element IDs defined in files under *directory*.

    Every file returned by list_files() is parsed and its IDs are collected
    with list_ids(). Files that cannot be read or parsed are reported as
    warnings and skipped, as are IDs that were already cataloged.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A mapping from each ID to a two-item list ``[topic_id, file_path]``,
        where ``topic_id`` is the first ID list_ids() reports for the file
        and ``file_path`` is the path of that file as a string.
    """
    result: dict[str, list[str]] = {}

    for file_path in list_files(directory):
        try:
            xml = etree.parse(file_path)
        except (etree.XMLSyntaxError, OSError) as message:
            warn(str(message))
            continue

        id_list = list_ids(xml)

        # list_ids() returns an empty list for documents whose root element
        # is not a topic type (for example a map); such files define no
        # topic ID, so there is nothing to catalog.
        if not id_list:
            continue

        topic_id = id_list[0]

        for xml_id in id_list:
            if xml_id in result:
                warn(str(file_path) + ": Duplicate ID: " + xml_id)
                continue

            result[xml_id] = [topic_id, str(file_path)]

    return result
|
|
70
|
+
|
|
71
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the program.

    Args:
        argv: Arguments to parse; when None, argparse reads them from
            ``sys.argv``.

    Returns:
        The parsed namespace. ``files`` is replaced with ``[sys.stdin]`` and
        ``output`` with ``sys.stdout`` when the first file is ``-``;
        ``output`` is also replaced with ``sys.stdout`` when it is ``-``.
        Options that were not supplied are left as False.
    """
    parser = argparse.ArgumentParser(
        prog=NAME,
        description=DESCRIPTION,
        add_help=False,
    )

    # Rename the section headings shown in the help output.
    parser._optionals.title = 'Options'
    parser._positionals.title = 'Arguments'

    # Options that take a value.
    valued_options = (
        (('-C', '--conref-target'), 'TARGET',
         'replace attribute references with reusable content references'),
        (('-D', '--images-dir'), 'DIRECTORY',
         'add a directory path to all image targets'),
        (('-X', '--xref-dir'), 'DIRECTORY',
         'update all cross references based on the supplied files'),
    )
    for flags, metavar, description in valued_options:
        parser.add_argument(*flags, default=False, metavar=metavar,
                            help=description)

    # Boolean switches.
    switches = (
        (('-i', '--prune-ids'),
         'remove invalid content from element IDs'),
        (('-I', '--prune-includes'),
         'remove unresolved include statements'),
    )
    for flags, description in switches:
        parser.add_argument(*flags, default=False, action='store_true',
                            help=description)

    output_group = parser.add_mutually_exclusive_group()
    output_group.add_argument(
        '-o', '--output',
        default=False,
        metavar='FILE',
        help='write output to the selected file instead of overwriting the file',
    )

    info_group = parser.add_mutually_exclusive_group()
    info_group.add_argument(
        '-h', '--help',
        action='help',
        help='display this help and exit',
    )
    info_group.add_argument(
        '-v', '--version',
        action='version',
        version=f'{NAME} {VERSION}',
        help='display version information and exit',
    )

    parser.add_argument(
        'files',
        metavar='FILE',
        default='-',
        nargs='+',
        help='specify the DITA files to clean up',
    )

    args = parser.parse_args(argv)

    # Reading from standard input implies writing to standard output.
    reads_stdin = args.files[0] == '-'
    if reads_stdin:
        args.files = [sys.stdin]
    if reads_stdin or args.output == '-':
        args.output = sys.stdout

    return args
|
|
129
|
+
|
|
130
|
+
def _write_tree(xml: etree._ElementTree, target) -> bool:
    """Write *xml* to *target* (a path or ``sys.stdout``); return success."""
    if target is sys.stdout:
        sys.stdout.write(etree.tostring(xml, encoding='unicode'))
        return True

    try:
        xml.write(target)
    except OSError as message:
        warn(str(message))
        return False

    return True


def process_files(args: argparse.Namespace) -> int:
    """Clean up the files selected on the command line.

    The work is done in two phases. First, every file is parsed and the
    requested clean-up steps (attribute references, image paths, ID pruning,
    include pruning) are applied. Second, only when ``args.xref_dir`` is
    set, the IDs found in that directory are cataloged and cross references
    in the same parsed documents are updated.

    When files are updated in place, a file is written only if a step
    changed it, and the first-phase result is written before the catalog is
    built. When output is redirected (``args.output`` is a path or
    ``sys.stdout``), each document is written exactly once, after its last
    phase.

    Args:
        args: Namespace produced by parse_args().

    Returns:
        0 on success, or ``EPERM`` if any file could not be parsed or
        written.
    """
    exit_code = 0
    redirected = bool(args.output)

    # Documents kept for the cross-reference phase. Reusing the parsed trees
    # (instead of parsing the input files again) keeps the first-phase
    # changes when output is redirected and avoids reading standard input
    # twice.
    pending: list[tuple] = []

    for file_path in args.files:
        try:
            xml = etree.parse(file_path)
        except (etree.XMLSyntaxError, OSError) as message:
            warn(str(message))
            exit_code = EPERM
            continue

        updated = False

        if args.conref_target and replace_attributes(xml, args.conref_target.strip()):
            updated = True

        if args.images_dir and update_image_paths(xml, args.images_dir.strip()):
            updated = True

        if args.prune_ids and prune_ids(xml):
            updated = True

        if args.prune_includes and prune_includes(xml):
            updated = True

        if args.xref_dir:
            pending.append((file_path, xml))

            # Redirected output is emitted once, after cross references
            # have been updated.
            if redirected:
                continue

        if not redirected and not updated:
            continue

        if not _write_tree(xml, args.output if redirected else file_path):
            exit_code = EPERM

    if not args.xref_dir:
        return exit_code

    # Built after the in-place writes above so that the catalog reflects the
    # cleaned-up files on disk.
    xml_ids = catalog_ids(args.xref_dir)

    for file_path, xml in pending:
        updated = update_xref_targets(xml, xml_ids, str(file_path))

        if not redirected and not updated:
            continue

        if not _write_tree(xml, args.output if redirected else file_path):
            exit_code = EPERM

    return exit_code
|
|
204
|
+
|
|
205
|
+
def run(argv: list[str] | None = None) -> None:
    """Run the command-line interface and exit with its status code.

    Args:
        argv: Command-line arguments; passed through to parse_args().

    The function always terminates the process through ``sys.exit``: with
    the value returned by process_files(), or with 130 when the run is
    interrupted from the keyboard.
    """
    try:
        status = process_files(parse_args(argv))
    except KeyboardInterrupt:
        status = 130

    sys.exit(status)
|
dita/cleanup/out.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Copyright (C) 2025 Jaromir Hradilek
|
|
2
|
+
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
# a copy of this software and associated documentation files (the "Soft-
|
|
7
|
+
# ware"), to deal in the Software without restriction, including without
|
|
8
|
+
# limitation the rights to use, copy, modify, merge, publish, distribute,
|
|
9
|
+
# sublicense, and/or sell copies of the Software, and to permit persons to
|
|
10
|
+
# whom the Software is furnished to do so, subject to the following condi-
|
|
11
|
+
# tions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included
|
|
14
|
+
# in all copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
17
|
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABI-
|
|
18
|
+
# LITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
19
|
+
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
|
|
20
|
+
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
21
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
22
|
+
# OTHER DEALINGS IN THE SOFTWARE.
|
|
23
|
+
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
from errno import EPERM
|
|
27
|
+
from . import NAME
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
'exit_with_error', 'warn'
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
def exit_with_error(error_message: str, exit_status: int = EPERM) -> None:
    """Print an error message to standard error and terminate the program.

    Args:
        error_message: Text to report; it is prefixed with the program name.
        exit_status: Exit status of the process (``EPERM`` by default).

    Raises:
        SystemExit: Always, carrying *exit_status*.
    """
    line = f'{NAME}: {error_message}'
    print(line, file=sys.stderr)
    raise SystemExit(exit_status)
|
|
36
|
+
|
|
37
|
+
def warn(error_message: str) -> None:
    """Print a warning, prefixed with the program name, to standard error.

    Args:
        error_message: Text to report.
    """
    line = f'{NAME}: {error_message}'
    print(line, file=sys.stderr)
|
dita/cleanup/xml.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# Copyright (C) 2025 Jaromir Hradilek
|
|
2
|
+
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
# a copy of this software and associated documentation files (the "Soft-
|
|
7
|
+
# ware"), to deal in the Software without restriction, including without
|
|
8
|
+
# limitation the rights to use, copy, modify, merge, publish, distribute,
|
|
9
|
+
# sublicense, and/or sell copies of the Software, and to permit persons to
|
|
10
|
+
# whom the Software is furnished to do so, subject to the following condi-
|
|
11
|
+
# tions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included
|
|
14
|
+
# in all copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
17
|
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABI-
|
|
18
|
+
# LITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
19
|
+
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
|
|
20
|
+
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
21
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
22
|
+
# OTHER DEALINGS IN THE SOFTWARE.
|
|
23
|
+
|
|
24
|
+
import re
|
|
25
|
+
from lxml import etree
|
|
26
|
+
from .out import warn
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
'list_ids', 'prune_ids', 'prune_includes', 'replace_attributes',
|
|
30
|
+
'update_image_paths', 'update_xref_targets'
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
def list_ids(xml: etree._ElementTree) -> list[str]:
    """Return the IDs defined in a DITA topic, topic ID first.

    Args:
        xml: Parsed document to inspect.

    Returns:
        An empty list when the root element is not ``concept``,
        ``reference``, ``task`` or ``topic``. Otherwise a list whose first
        item is the ID of the root element (an empty string if it has none),
        followed by the IDs of all other elements in document order,
        excluding IDs that start with an underscore.
    """
    root = xml.getroot()

    if root.tag not in ('concept', 'reference', 'task', 'topic'):
        return []

    # Index 0 always holds the topic ID so that callers can rely on it.
    result: list[str] = [str(root.get('id', ''))]

    for e in xml.iter():
        if e is root:
            continue

        # get() is used instead of attrib.has_key(): iteration also yields
        # comments and processing instructions, and has_key() is specific to
        # lxml's element attribute mapping.
        xml_id = e.get('id')

        if xml_id is None:
            continue

        xml_id = str(xml_id)

        # NOTE(review): IDs with a leading underscore are presumably the
        # ones auto-generated during conversion -- confirm.
        if xml_id.startswith('_'):
            continue

        result.append(xml_id)

    return result
|
|
56
|
+
|
|
57
|
+
def prune_ids(xml: etree._ElementTree) -> bool:
    """Remove unresolved AsciiDoc attribute references from element IDs.

    IDs that already look like valid XML names are left alone. In any other
    ID, every ``{name}``, ``{set:...}``, ``{counter:...}`` or
    ``{counter2:...}`` reference is removed together with one directly
    preceding ``_`` or ``-``.

    Args:
        xml: Parsed document to modify in place.

    Returns:
        True if at least one ID was changed, False otherwise.
    """
    updated = False

    adoc_attribute = re.compile(r'[_-]?\{([0-9A-Za-z_][0-9A-Za-z_-]*|set:.+?|counter2?:.+?)\}')
    # A name start character followed by any number of name characters, so
    # that one-character IDs are accepted as valid too.
    valid_id = re.compile(r'[A-Za-z_:][A-Za-z0-9_:.-]*')

    for e in xml.iter():
        # get() also works on the comment and processing-instruction nodes
        # that iteration yields; it returns None for them.
        xml_id = e.get('id')

        if xml_id is None:
            continue

        xml_id = str(xml_id)

        if valid_id.fullmatch(xml_id):
            continue

        pruned = adoc_attribute.sub('', xml_id)

        # Report a change only when the ID really differs; otherwise an
        # unchanged file would be rewritten.
        if pruned == xml_id:
            continue

        e.set('id', pruned)
        updated = True

    return updated
|
|
76
|
+
|
|
77
|
+
def _detach(element: etree._Element) -> None:
    """Remove *element* from its parent while keeping the text after it.

    The text following an element is stored as that element's tail and
    would be removed along with it, so it is first appended to the previous
    sibling's tail or, if there is no previous sibling, to the parent's
    text. The caller must make sure that *element* has a parent.
    """
    parent = element.getparent()

    if element.tail:
        previous = element.getprevious()

        if previous is not None:
            previous.tail = (previous.tail or '') + element.tail
        else:
            parent.text = (parent.text or '') + element.tail

    parent.remove(element)


def prune_includes(xml: etree._ElementTree) -> bool:
    """Remove cross references that point to AsciiDoc files.

    Every ``xref`` element whose ``href`` ends with ``.adoc`` is removed.
    If that leaves its parent with no child nodes and no text other than
    whitespace, the parent is removed as well. Text that follows a removed
    element is preserved.

    Args:
        xml: Parsed document to modify in place.

    Returns:
        True if at least one ``xref`` element was removed, False otherwise.
    """
    updated = False

    # Iterate over a snapshot: the tree is modified inside the loop.
    for e in list(xml.iter('xref')):
        if not str(e.get('href', '')).endswith('.adoc'):
            continue

        parent = e.getparent()

        if parent is None:
            continue

        _detach(e)
        updated = True

        # Keep the parent if it still carries any content of its own.
        if len(parent) != 0 or (parent.text or '').strip():
            continue

        if parent.getparent() is None:
            continue

        _detach(parent)

    return updated
|
|
106
|
+
|
|
107
|
+
def rebuild_text(text: str, conref_prefix: str) -> tuple[str, list[etree._Element]]:
    """Split *text* around AsciiDoc attribute references.

    Each ``{name}`` reference that is not directly preceded by ``$`` is
    turned into a ``ph`` element whose ``conref`` attribute is
    *conref_prefix* followed by the lower-cased attribute name. The text
    between two references becomes the tail of the element before it.

    Args:
        text: Text to scan.
        conref_prefix: String placed before the attribute name in each
            ``conref`` value.

    Returns:
        A tuple ``(start, nodes)``: the text preceding the first reference
        and the list of new elements. When *text* contains no reference,
        ``('', [])`` is returned.
    """
    adoc_attribute = re.compile(r'(?<!\$)\{([0-9A-Za-z_][0-9A-Za-z_-]*)\}')

    start = ''
    nodes: list[etree._Element] = []
    position = 0

    # Work with match positions rather than splitting on the literal
    # "{name}" string: a split could cut at an earlier "${name}", which the
    # look-behind deliberately excludes.
    for match in adoc_attribute.finditer(text):
        literal = text[position:match.start()]

        if nodes:
            nodes[-1].tail = literal
        else:
            start = literal

        node = etree.Element('ph')
        node.set('conref', conref_prefix + match.group(1).lower())
        nodes.append(node)

        position = match.end()

    if nodes:
        nodes[-1].tail = text[position:]

    return start, nodes
|
|
130
|
+
|
|
131
|
+
def replace_attributes(xml: etree._ElementTree, conref_prefix: str) -> bool:
    """Replace AsciiDoc attribute references with content references.

    The text and tail of every node are passed through rebuild_text(); the
    ``ph`` elements it returns are inserted where the references were.

    Args:
        xml: Parsed document to modify in place.
        conref_prefix: Target placed before the attribute name in each
            ``conref`` value; a trailing ``/`` is added if missing.

    Returns:
        True if at least one reference was replaced, False otherwise.
    """
    updated = False

    if not conref_prefix.endswith('/'):
        conref_prefix = conref_prefix + '/'

    # Iterate over a snapshot: the tree is modified inside the loop and the
    # inserted elements do not need to be visited.
    for e in list(xml.iter()):
        # Comments and processing instructions have a non-string tag and
        # cannot take child elements, so their own text is left untouched.
        if e.text and isinstance(e.tag, str):
            text, nodes = rebuild_text(str(e.text), conref_prefix)

            if nodes:
                e.text = text

                for index, node in enumerate(nodes):
                    e.insert(index, node)

                updated = True

        if e.tail:
            text, nodes = rebuild_text(str(e.tail), conref_prefix)
            parent = e.getparent()

            # Without a parent there is nowhere to insert the new elements;
            # the tail is then left as it is.
            if nodes and parent is not None:
                e.tail = text

                index = parent.index(e)
                for offset, node in enumerate(nodes, start=1):
                    parent.insert(index + offset, node)

                updated = True

    return updated
|
|
170
|
+
|
|
171
|
+
def update_image_paths(xml: etree._ElementTree, images_dir: str) -> bool:
    """Prefix the target of every image with a directory path.

    Args:
        xml: Parsed document to modify in place.
        images_dir: Directory path to prepend; a trailing ``/`` is added if
            missing. An empty string leaves the document untouched.

    Returns:
        True if the ``href`` of at least one ``image`` element was changed,
        False otherwise.
    """
    if images_dir == '':
        return False

    prefix = images_dir if images_dir.endswith('/') else images_dir + '/'
    changed = False

    for node in xml.iter():
        if node.tag == 'image' and 'href' in node.attrib:
            node.attrib['href'] = prefix + str(node.attrib['href'])
            changed = True

    return changed
|
|
190
|
+
|
|
191
|
+
def update_xref_targets(xml: etree._ElementTree, xml_ids: dict[str, list[str]], file_path: str) -> bool:
    """Point local cross references at the files that define their targets.

    Only ``xref`` and ``link`` elements whose ``href`` starts with ``#`` and
    whose ``scope`` is not ``external`` are considered. The part after the
    ``#`` is looked up in *xml_ids*: an ID equal to it wins; otherwise the
    IDs that are a prefix of it followed by ``_`` are candidates. References
    with no candidate or more than one candidate are reported as warnings
    and left unchanged.

    Args:
        xml: Parsed document to modify in place.
        xml_ids: Mapping from an ID to ``[topic_id, file_path]``.
        file_path: Name of the processed file, used in warnings only.

    Returns:
        True if at least one reference was updated, False otherwise.
    """
    updated = False

    for e in xml.iter():
        if e.tag not in ('xref', 'link'):
            continue
        if e.get('scope') == 'external':
            continue

        href = e.get('href')

        if href is None or not str(href).startswith('#'):
            continue

        href = str(href).lstrip('#')

        if href in xml_ids:
            # An exact match is unambiguous even when a shorter ID happens
            # to be a prefix of it.
            match = [href]
        else:
            # Empty IDs are ignored here: as a prefix they would match
            # every reference that starts with an underscore.
            match = [i for i in xml_ids if i and href.startswith(i + '_')]

        if not match:
            warn(file_path + ": No matching ID: " + href)
            continue
        if len(match) > 1:
            warn(file_path + ": Multiple matching IDs: " + href)
            continue

        target_id = match[0]
        topic_id, target_file = xml_ids[target_id]

        if topic_id == target_id:
            e.set('href', target_file + '#' + topic_id)
        else:
            e.set('href', target_file + '#' + topic_id + '/' + target_id)

        updated = True

    return updated
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dita-cleanup
|
|
3
|
+
Version: 0.9.0
|
|
4
|
+
Summary: Clean up DITA topics after conversion from AsciiDoc.
|
|
5
|
+
Author-email: Jaromir Hradilek <jhradilek@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jhradilek/fix-dita-links
|
|
8
|
+
Project-URL: Repository, https://github.com/jhradilek/fix-dita-links
|
|
9
|
+
Project-URL: Issues, https://github.com/jhradilek/fix-dita-links/issues
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Topic :: Documentation
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: lxml>=4.9.2
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# dita-cleanup
|
|
23
|
+
|
|
24
|
+
**dita-cleanup** is a command-line utility that allows you to clean up DITA topics after conversion from AsciiDoc.
|
|
25
|
+
|
|
26
|
+
In combination with [asciidoctor-dita-vale](https://github.com/jhradilek/asciidoctor-dita-vale), [asciidoctor-dita-topic](https://github.com/jhradilek/asciidoctor-dita-topic), and [dita-convert](https://github.com/jhradilek/dita-custom-xslt/), this project can be used to rapidly convert AsciiDoc content to DITA:
|
|
27
|
+
|
|
28
|
+
1. Identify incompatible markup in the AsciiDoc source file:
|
|
29
|
+
|
|
30
|
+
```console
|
|
31
|
+
vale source_file.adoc
|
|
32
|
+
```
|
|
33
|
+
2. Convert the AsciiDoc file to a generic DITA topic:
|
|
34
|
+
|
|
35
|
+
```console
|
|
36
|
+
asciidoctor -r dita-topic -b dita-topic -S secure source_file.adoc
|
|
37
|
+
```
|
|
38
|
+
3. Convert the generic DITA topic to a specialized DITA concept, reference, or task:
|
|
39
|
+
|
|
40
|
+
```console
|
|
41
|
+
dita-convert -g source_file.dita -o output_file.dita
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
4. Clean up the resulting DITA file:
|
|
45
|
+
|
|
46
|
+
```console
|
|
47
|
+
dita-cleanup -iI -D ../images -X . output_file.dita
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
Install the `dita-cleanup` Python package:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
python3 -m pip install --upgrade dita-cleanup
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Usage
|
|
59
|
+
|
|
60
|
+
* Remove unresolved [AsciiDoc attribute references](https://docs.asciidoctor.org/asciidoc/latest/attributes/reference-attributes/#reference-custom) from element IDs:
|
|
61
|
+
|
|
62
|
+
```console
|
|
63
|
+
dita-cleanup -i *.dita
|
|
64
|
+
```
|
|
65
|
+
* Remove unresolved [AsciiDoc include directives](https://docs.asciidoctor.org/asciidoc/latest/directives/include/):
|
|
66
|
+
|
|
67
|
+
```console
|
|
68
|
+
dita-cleanup -I *.dita
|
|
69
|
+
```
|
|
70
|
+
* Replace unresolved [AsciiDoc attribute references](https://docs.asciidoctor.org/asciidoc/latest/attributes/reference-attributes/#reference-custom) with reusable content references:
|
|
71
|
+
|
|
72
|
+
```console
|
|
73
|
+
dita-cleanup -C 'topic.dita#topic-id' *.dita
|
|
74
|
+
```
|
|
75
|
+
* Add a directory path to all image references:
|
|
76
|
+
|
|
77
|
+
```console
|
|
78
|
+
dita-cleanup -D ../images/ *.dita
|
|
79
|
+
```
|
|
80
|
+
* Update invalid cross references based on DITA files present in the supplied directory:
|
|
81
|
+
|
|
82
|
+
```console
|
|
83
|
+
dita-cleanup -X . *.dita
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
* Print the updated files to standard output instead of overwriting the supplied files:
|
|
87
|
+
|
|
88
|
+
```console
|
|
89
|
+
dita-cleanup -iI -D ../images/ -X . -o - *.dita
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
For a complete list of available command-line options, run `dita-cleanup` with the `-h` option:
|
|
93
|
+
|
|
94
|
+
```console
|
|
95
|
+
dita-cleanup -h
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Copyright
|
|
99
|
+
|
|
100
|
+
Copyright © 2025 Jaromir Hradilek
|
|
101
|
+
|
|
102
|
+
This program is free software, released under the terms of the MIT license. It is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
dita/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
dita/cleanup/__init__.py,sha256=4l8fIE3mmFXoagc17-6xl7jzN2vC4IhY7F_vsPeUA_s,1530
|
|
3
|
+
dita/cleanup/__main__.py,sha256=kSr9bYi-GVSCs6zFAYDnck2xT-a60M-yxovVHUBLnz8,1219
|
|
4
|
+
dita/cleanup/cli.py,sha256=ICBwcEWPeso53psKmZAaIAjMMdyM6QOY6n6_yIetyls,6530
|
|
5
|
+
dita/cleanup/out.py,sha256=dYrtBoIAipxgU4S4XCvqP8BpyNjDL6xsv4_AOLGDwRM,1499
|
|
6
|
+
dita/cleanup/xml.py,sha256=M0SzPFmH_JtoQZlBK-l9mJOHB7tlB3XU7A9N9tVdSO8,6207
|
|
7
|
+
dita_cleanup-0.9.0.dist-info/licenses/LICENSE,sha256=0qby_GlVkTnOlxGARsEXqHyBa-qGz8Wen9CpnylkBtc,1073
|
|
8
|
+
dita_cleanup-0.9.0.dist-info/METADATA,sha256=cAewo-BJhcpQcJHg6V3QB2r_u3BYZTajcQfe63ZUd9g,3408
|
|
9
|
+
dita_cleanup-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
10
|
+
dita_cleanup-0.9.0.dist-info/entry_points.txt,sha256=3gqSHvJ2B7ERJlHgIsGcjpAsYpQRyHmC_RXKwiTImwE,54
|
|
11
|
+
dita_cleanup-0.9.0.dist-info/top_level.txt,sha256=pcySPGjS3m2yMBIIBm9b-Zdh7qTNpybOV9OxOjETKas,5
|
|
12
|
+
dita_cleanup-0.9.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Copyright (C) 2025 Jaromir Hradilek
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
to deal in the Software without restriction, including without limitation
|
|
8
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dita
|