wipo-gbd-transformation 1.1.53-py3-none-any.whl → 1.1.55-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wipo-gbd-transformation might be problematic.
- gbdtransformation/brands/catm/filters.py +2 -1
- gbdtransformation/brands/catm/template.yml +9 -5
- gbdtransformation/brands/chtm/filters.py +153 -181
- gbdtransformation/brands/chtm/schema +79 -91
- gbdtransformation/brands/chtm/template.yml +98 -107
- gbdtransformation/brands/natm/__init__.py +2 -3
- gbdtransformation/brands/natm/filters.py +8 -2
- gbdtransformation/brands/notm/__init__.py +5 -0
- gbdtransformation/brands/notm/filters.py +117 -0
- gbdtransformation/brands/notm/template.yml +165 -0
- gbdtransformation/brands/phtm/schema +79 -50
- gbdtransformation/brands/phtm/schema.classic +50 -0
- gbdtransformation/brands/phtm/template.classic.yml +102 -0
- gbdtransformation/brands/phtm/template.yml +1 -102
- gbdtransformation/brands/vctm/__init__.py +5 -0
- gbdtransformation/brands/vctm/filters.py +75 -0
- gbdtransformation/brands/vctm/schema +87 -0
- gbdtransformation/brands/vctm/template.yml +1 -0
- gbdtransformation/brands/vctm/tests/__init__.py +0 -0
- gbdtransformation/brands/xxxx/template.yml +1 -1
- gbdtransformation/designs/woid/filters.py +26 -1
- gbdtransformation/designs/woid/template.yml +166 -1
- gbdtransformation/designs/xxid/template.yml +118 -2
- gbdtransformation/execs-nico.py +709 -0
- gbdtransformation/execs.py +9 -4
- gbdtransformation/gbd-transform.exec.tgz +0 -0
- {wipo_gbd_transformation-1.1.53.dist-info → wipo_gbd_transformation-1.1.55.dist-info}/METADATA +5 -3
- {wipo_gbd_transformation-1.1.53.dist-info → wipo_gbd_transformation-1.1.55.dist-info}/RECORD +33 -24
- wipo_gbd_transformation-1.1.55.dist-info/SOURCES_Stefans-Mac-Studio.local_Sep-18-063455-2024_Conflict.txt +690 -0
- {wipo_gbd_transformation-1.1.53.dist-info → wipo_gbd_transformation-1.1.55.dist-info}/WHEEL +1 -1
- gbdtransformation/designs/bgid/__init__.py +0 -5
- gbdtransformation/designs/bgid/filters.py +0 -91
- gbdtransformation/designs/bgid/schema +0 -106
- gbdtransformation/designs/bgid/template.yml +0 -169
- {wipo_gbd_transformation-1.1.53.dist-info → wipo_gbd_transformation-1.1.55.dist-info}/LICENSE.md +0 -0
- {wipo_gbd_transformation-1.1.53.dist-info → wipo_gbd_transformation-1.1.55.dist-info}/entry_points.txt +0 -0
- {wipo_gbd_transformation-1.1.53.dist-info → wipo_gbd_transformation-1.1.55.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,709 @@

import sys
import time
import argparse
import random
import os
import traceback
import difflib
import gzip
import multiprocessing
import xml.etree.ElementTree as ET
import concurrent.futures
import pprint

from tabulate import tabulate
from gbdtransformation.parser import Parser


def build_command_parser(options, doc):
    """Argparse builder
    @param options: the dict of config options
    @param doc: the helper for the command
    return parsed args"""
    parser = argparse.ArgumentParser(description=doc,
                                     formatter_class=argparse.RawTextHelpFormatter)
    for config in options:
        name = config.pop('name')
        parser.add_argument(*name, **config)
    return parser.parse_args()

parsers = {}


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    INFO = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    CRITICAL = '\033[91m'
    ERROR = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

class progress:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def start(self):
        printProgressBar(0, self.total,
                         prefix='Progress:', suffix='Complete', length=50)

    def advance(self, value):
        self.done = value
        printProgressBar(self.done, self.total,
                         prefix='Progress:', suffix='Complete', length=50)

    def advance_with_step(self, value):
        self.done += value
        printProgressBar(self.done, self.total,
                         prefix='Progress:', suffix='Complete', length=50)


def test():
    doc = """
    Runs regression tests
    """
    configs = [{
        'name': ['--junit'],
        'dest': 'junit',
        'help': 'saves in junit format',
        'action': 'store_true',
        'default': False
    }]
    args = build_command_parser(configs, doc)
    pkg_folder = os.path.dirname(__file__)
    test_to_run = []
    for type in ['brands', 'designs']:
        path = os.path.join(pkg_folder, type)
        for root, dirs, files in os.walk(path):
            if 'tests' in dirs:
                template = os.path.basename(root)
                for file in os.listdir(os.path.join(root, 'tests')):
                    if file.startswith('_'):
                        continue
                    if file.endswith('.out'):
                        continue
                    filename, ext = os.path.splitext(file)
                    input_file_path = os.path.join(root, 'tests', file)
                    out_file_path = input_file_path.replace(ext, '.out')
                    has_output = os.path.exists(out_file_path)
                    test_to_run.append({
                        'template': template,
                        'path': input_file_path,
                        'test_output': has_output,
                        'invalid_output': None
                    })
    for test in test_to_run:
        res, exceptions, error = _run_per_file(
            test['template'], test['path'])
        test['execution'] = res
        test['errors'] = exceptions
        filename, ext = os.path.splitext(test['path'])
        if test['test_output']:
            expected = ''
            with open(test['path'].replace(ext, '.out'), 'r') as f:
                expected = [e.replace('\n', '') for e in f.readlines()]
            delta = difflib.ndiff(expected, res.split('\n'))
            to_outup_diffs = []
            for d in delta:
                if d[0] != ' ':
                    to_outup_diffs.append(d)
                else:
                    if to_outup_diffs:
                        break
                    to_outup_diffs = []
            test['invalid_output'] = '\n'.join(to_outup_diffs)
    display = [
        ['Nb.', 'Template', 'Input', 'Has run?', 'Errors', 'Valid output']
    ]
    counter = 0
    if args.junit:
        total = 0
        errors = 0
        fail = 0
        tests_run_xml = []
        for test in test_to_run:
            if test['errors']:
                tmp = '''<failure type="Conversion error">
%s
</failure>''' % test['errors']
            elif test['test_output'] and test['invalid_output']:
                tmp = '''<failure type="Invalid output">
%s
</failure>''' % test['invalid_output']
            else:
                tmp = ''
            current = '''
<testcase classname="%s" name="%s" time="0.001">
%s
</testcase>''' % (test['path'], test['template'], tmp)
            total += 1
            if test['errors']:
                errors += 1
            elif test['test_output']:
                if test['invalid_output']:
                    fail += 1
            tests_run_xml.append(current)
        payload = '\n'.join(tests_run_xml)
        xml = '''<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="integration" tests="%s" errors="%s" failures="%s" skip="0">
%s
</testsuite>''' % (total, errors, fail, payload)
        with open('tests.xml', 'w') as f:
            f.write(xml)
    for test in test_to_run:
        counter += 1
        has_run = u'\u2713'
        color = ''
        end_color = ''
        valid_output = "No output to test"
        if test['test_output']:
            valid_output = u'\u2713'
            if test['invalid_output']:
                valid_output = test['invalid_output']
                color = bcolors.WARNING
                end_color = bcolors.ENDC
        if test['errors']:
            valid_output = u'\u2717'
            has_run = u'\u2717'
            color = bcolors.FAIL
            end_color = bcolors.ENDC
            test['errors'] = '\n'.join(['%s%s%s' % (color, e, end_color)
                                        for e in test['errors'].split('\n')])
        display.append([
            '%s%s' % (color, counter), test['template'], os.path.basename(test['path']),
            has_run, test['errors'], '%s%s%s' % (color,
                                                 valid_output, end_color)])
    print(tabulate(display[1:], headers=display[0]))


def _run_per_file(template, path, input_string=None, validate=False):
    parser = Parser(template)
    if input_string:
        data = input_string
    else:
        data = path
    try:
        transformed = parser.run(data, raise_errors=True)
        if validate:
            transformed, errors = parser.validate(transformed, gbd_format=transformed)
            return (transformed, None, errors)
        return (transformed, None, None)
    except Exception as e:
        return (None, traceback.format_exc(), None)


def printProgressBar(iteration, total, prefix='', suffix='',
                     decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration - Required : current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : suffix string (Str)
        decimals  - Optional : positive number of decimals in percent complete (Int)
        length    - Optional : character length of bar (Int)
        fill      - Optional : bar fill character (Str)
        printEnd  - Optional : end character (e.g. "\r", "\r\n") (Str)
    """
    return
    # percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    # filledLength = int(length * iteration // total)
    # bar = fill * filledLength + '-' * (length - filledLength)
    # print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)
    # Print New Line on Complete
    # if iteration == total:
    #     print()

def do_transform(file, templates, validate=False):
    raw_data = __read_file(file)

    for template in templates.split(','):
        parser = parsers.get(template)
        ret = {'src': file}
        ret['fmt'] = 'gbd'
        # get return from transformation
        if template == 'solrjtm':
            ret['fmt'] = 'idx'
            try:
                transformed = parser.run(raw_data, raise_errors=True)
                ret['out'] = transformed
            except Exception as e:
                ret['terror'] = {'message': e, 'stacktrace': traceback.format_exc()}
        else:
            if not validate or template == 'solrjtm':
                try:
                    transformed = parser.run(raw_data, raise_errors=True)
                    ret['out'] = transformed
                    raw_data = transformed
                except Exception as e:
                    ret['terror'] = {'message': e, 'stacktrace': traceback.format_exc()}
            # get return from transformation and validation
            else:
                try:
                    transformed, errors = parser.validate(raw_data)
                    ret['out'] = transformed
                    ret['verrors'] = errors
                    raw_data = transformed
                except Exception as e:
                    ret['terror'] = {'message': e, 'stacktrace': traceback.format_exc()}
    return ret


def _sub_arry_offset(max_paralel, length, offset):
    if offset + max_paralel < length:
        return offset + max_paralel
    return length


def _paralel_process(path, xpath_lines):

    max_parallel = 25
    # Schedule an initial scan for each segment of the table. We read each
    # segment in a separate thread, then look to see if there are more rows to
    # read -- and if so, we schedule another scan.
    tasks_to_do = []
    for root, dirs, files in os.walk(path):
        for f in files:
            # TODO: match file name with regex
            if f.endswith('.xml.gz'):
                file2process = os.path.join(path, root, f)
                tasks_to_do.append(file2process)
    pbar = progress(len(tasks_to_do))

    task_counter = 0
    # Make the list an iterator, so the same tasks don't get run repeatedly.

    with concurrent.futures.ThreadPoolExecutor() as executor:

        # Schedule the initial batch of futures. Here we assume that
        # max_scans_in_parallel < total_segments, so there's no risk that
        # the queue will throw an Empty exception.
        futures = {
            executor.submit(_analyse_for_shazam, file2process, xpath_lines): file2process
            for file2process in tasks_to_do[task_counter:_sub_arry_offset(max_parallel,
                                                                          len(tasks_to_do),
                                                                          task_counter)]
        }
        pbar.start()
        task_counter = len(futures)
        while futures:
            # Wait for the first future to complete.
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            pbar.advance_with_step(len(done))
            for fut in done:
                res = fut.result()
                file2process = futures.pop(fut)
                yield xpath_lines

            # Schedule the next batch of futures. At some point we might run out
            # of entries in the queue if we've finished scanning the table, so
            # we need to spot that and not throw.
            for file2process in tasks_to_do[task_counter:_sub_arry_offset(len(done),
                                                                          len(tasks_to_do),
                                                                          task_counter)]:
                task_counter += 1
                futures[executor.submit(_analyse_for_shazam, file2process, xpath_lines)] = file2process

def _doc2xpath(el, path, lines, root=''):
    lines.add(root + path)
    path = root + path
    # Print attributes
    for name, val in el.items():
        lines.add(path + "[@" + _removeNS(name) + "=" + val + "]")
    # Counter on the sibling element names
    counters = {}
    # Loop on child elements
    for childEl in el:
        tag = _removeNS(childEl.tag)
        # Tag name already encountered?
        if tag in counters:
            continue
        counters[tag] = 1
        # Print child node recursively
        _doc2xpath(childEl, '/' + tag, lines, root=path)

def _removeNS(tag):
    if tag.find('}') == -1:
        return tag
    else:
        return tag.split('}', 1)[1]

def _analyse_for_shazam(file2process, xpath_lines):
    stream = __read_file(file2process)
    tree = ET.ElementTree(ET.fromstring(stream))
    troot = tree.getroot()
    _doc2xpath(troot, _removeNS(troot.tag), xpath_lines)

def shazam():
    doc = """
    deduce xpath lines from a directory of xml files
    """
    configs = [{
        'name': ['path'],
        'type': str,
        'help': 'path to a file or a directory'
    }, {
        'name': ['-o'],
        'dest': 'outfile',
        'help': 'write output to a file',
        'type': str,
        'default': None,
    }, ]

    args = build_command_parser(configs, doc)
    path = args.path

    if os.path.isfile(path):
        print('Expected a directory location.')
        sys.exit(1)
    # a set to contain the unique xpath lines
    xpath_lines = set()

    # in case the path passed is relative
    if not os.path.isabs(path):
        path = os.path.realpath(os.path.join(os.getcwd(), path))
    # passed a directory
    current_xplath_lines = None
    for tmp in _paralel_process(path, xpath_lines):
        current_xplath_lines = tmp

    xpath_lines = current_xplath_lines
    if(args.outfile):
        with open(args.outfile, 'w') as fh:
            for line in sorted(xpath_lines):
                xpath = line.split('/')
                leaf = xpath.pop()
                fh.write(''.join(['__' for p in xpath]) + '/' + leaf)
                fh.write('\n')
    else:
        pprint.pprint(xpath_lines)


def _paralel_run(tasks_to_do, templates, pbar, validate=False, max_parallel=25):
    # Schedule an initial scan for each segment of the table. We read each
    # segment in a separate thread, then look to see if there are more rows to
    # read -- and if so, we schedule another scan.

    task_counter = 0
    # Make the list an iterator, so the same tasks don't get run repeatedly.

    with concurrent.futures.ThreadPoolExecutor() as executor:

        # Schedule the initial batch of futures. Here we assume that
        # max_scans_in_parallel < total_segments, so there's no risk that
        # the queue will throw an Empty exception.
        futures = {
            executor.submit(do_transform, file2process, templates, validate): file2process
            for file2process in tasks_to_do[task_counter:_sub_arry_offset(max_parallel,
                                                                          len(tasks_to_do),
                                                                          task_counter)]
        }
        task_counter = len(futures)
        while futures:
            # Wait for the first future to complete.
            processed, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            pbar.advance_with_step(len(processed))
            for fut in processed:
                res = fut.result()
                file2process = futures.pop(fut)
                yield res

            # Schedule the next batch of futures. At some point we might run out
            # of entries in the queue if we've finished scanning the table, so
            # we need to spot that and not throw.
            for file2process in tasks_to_do[task_counter:_sub_arry_offset(len(processed),
                                                                          len(tasks_to_do),
                                                                          task_counter)]:
                task_counter += 1
                futures[executor.submit(do_transform, file2process, templates, validate)] = file2process


def do_multiprocess(files, settings):
    (args, pbar, done) = settings

    results = []
    # create parsers
    for template in args.template.split(','):
        parsers[template] = Parser(template)

    for file in files:
        results.append(do_transform(file, args.template, validate=args.validate))
        done.value += 1
        pbar.advance(done.value)
    # for tmp in _paralel_run(files, args.template, pbar, validate=args.validate,
    #                         max_parallel=args.threads):
    #     results.append(tmp)
    return results


def run():
    doc = """
    transform input to output using a defined template name.
    """
    configs = [{
        'name': ['path'],
        'type': str,
        'help': 'path to a file or a directory'
    }, {
        'name': ['template'],
        'type': str,
        'help': 'the template used for transformation'
    }, {
        'name': ['-t'],
        'dest': 'top',
        'type': int,
        'help': 'number of files to run the command onto',
        'default': 0
    }, {
        'name': ['-r'],
        'dest': 'random',
        'type': int,
        'help': 'number of *random* files to run the command onto',
        'default': 0
    }, {
        'name': ['-w'],
        'dest': 'workers',
        'type': int,
        'help': 'number of workers to run the command',
        'default': 1
    }, {
        'name': ['-th'],
        'dest': 'threads',
        'type': int,
        'help': 'number of threads to run the command',
        'default': 25
    }, {
        'name': ['-o'],
        'dest': 'outfile',
        'help': 'write output to a file',
        'type': str,
        'default': None,
    }, {
        'name': ['-a'],
        'dest': 'appendfile',
        'help': 'append output to a file',
        'type': str,
        'default': None,
    }, {
        'name': ['--qc'],
        'dest': 'validate',
        'help': 'runs gbd-validate on output',
        'action': 'store_true',
        'default': False
    }, {
        'name': ['-q', '--quiet'],
        'dest': 'quiet',
        'help': 'perform transformation quietly (do not print result of transformation)',
        'action': 'store_true',
        'default': False
    }, ]
    args = build_command_parser(configs, doc)


    def _walk_dir(root_path, nb):
        buffer = []
        for root, dirs, files in os.walk(root_path):
            for f in files:
                if f.endswith('.xml.gz'):  # or f.endswith('.xml'):
                    buffer.append(os.path.join(root_path, root, f))
                if len(buffer) == nb:
                    return buffer
        return buffer

    def _fish_dir(root_path, nb):
        buffer = []
        path = root_path
        # go fishing
        while len(buffer) < nb:
            sea = os.listdir(path)
            # skip empty directories
            if not len(sea):
                path = root_path
                continue
            fish = os.path.join(path, random.choice(sea))
            if os.path.isdir(fish):
                path = fish
            elif os.path.isfile(fish) and fish.endswith('.xml.gz'):
                buffer.append(fish)
                path = root_path
        return buffer

    path = args.path
    # in case the path passed is relative
    if not os.path.isabs(path):
        path = os.path.realpath(os.path.join(os.getcwd(), path))

    files = []
    # passed a file
    if os.path.isfile(path):
        files.append(path)
    # passed a directory
    elif os.path.isdir(path):
        if args.random:
            files = _fish_dir(path, args.random)
        else:
            files = _walk_dir(path, args.top)
    else:
        raise Exception('invalid path %s. try again.' % path)


    workers = min(multiprocessing.cpu_count() - 4, args.workers)

    # print('Running template [%s] * [%s files] with [%s workers]' % (args.template,
    #                                                                 len(files), workers))
    files_per_worker_len = len(files) / workers

    files_per_worker = []
    tmp = []
    for el in files:
        if len(tmp) >= files_per_worker_len:
            files_per_worker.append(tmp)
            tmp = []
        tmp.append(el)
    files_per_worker.append(tmp)

    pbar = progress(len(files))
    pbar.start()

    # a way to share state among workers
    mpmanager = multiprocessing.Manager()
    done = mpmanager.Value('i', 0)

    with multiprocessing.Pool(processes=workers) as pool:  # auto closing workers
        raw_results = pool.starmap(do_multiprocess, zip(files_per_worker, [(args, pbar, done) for x in files]))
        results = []
        for result in raw_results:
            results.extend(result)

    _print_transformation_out(results, args)
    _print_transformation_err(results, args)
    _print_validation_err(results, args)


def _print_transformation_out(results, args):
    output_storage = args.outfile or args.appendfile or '/dev/null'
    output_mode = 'a' if args.appendfile else 'w'
    # fh = open(output_storage, output_mode)

    dirFiles = {}
    for r in results:
        if r.get('out', None):
            dir = os.path.dirname(r.get('src'))
            parentDir = os.path.dirname(dir)
            # print("dir: ", dir, " parent ", parentDir)
            destFile = os.path.join(parentDir, r.get('fmt') + ".json")
            if destFile not in dirFiles:
                dirFiles[parentDir] = destFile
    # no support for append
    dirHandles = {}
    for dir in dirFiles:
        dirHandles[dir] = open(dirFiles[dir], 'w')
        print("Creating this file: ", dirFiles[dir])
        dirHandles[dir].write("[\n")

    for result in results:
        if result.get('out', None):
            if not args.quiet:
                print(result['out'])
            childDir = os.path.dirname(result.get('src'))
            dir = os.path.dirname(childDir)
            # dirHandles[dir]  # with open(dirFiles.get(dir), 'a') as df:
            dirHandles[dir].write(result['out'])
            dirHandles[dir].write(",\n")
            # fh.write(result['out'])
            # fh.write('\n')
    for dh in dirHandles.values():
        dh.write("{}]\n")
        dh.close()

    # fh.close()

def _print_validation_err(results, args):
    if not args.validate:
        return

    display_lines = []

    for result in results:
        verrors = result.get('verrors', [])
        if not len(verrors):
            continue

        display_line = {}
        display_line['QC Invalid File'] = __format_color(result['src'], bcolors.FAIL)
        display_line['Severity'] = []
        display_line['Field'] = []
        display_line['Message'] = []

        for i, verror in enumerate(verrors):
            severity = __format_color(verror['severity'], getattr(bcolors, verror['severity']))
            field = verror['field']
            message = verror['type']

            display_line['Severity'].append(severity)
            display_line['Field'].append(field)
            display_line['Message'].append(message)

        display_line['Severity'] = '\n'.join(display_line['Severity'])
        display_line['Field'] = '\n'.join(display_line['Field'])
        display_line['Message'] = '\n'.join(display_line['Message'])
        display_lines.append(display_line)

    if len(display_lines):
        print('\n')
        print(tabulate(display_lines, headers='keys', showindex='always', tablefmt='psql'))

def _print_transformation_err(results, args):
    # a single file
    if(len(results) == 1):
        result = results[0]
        if result.get('terror', None):
            print(__format_color(result['terror']['stacktrace'], bcolors.FAIL))
        return

    # multi file
    display_lines = []

    for result in results:
        if not result.get('terror', None):
            continue

        display_line = {}
        display_line['Transformation Failed File'] = __format_color(result['src'], bcolors.FAIL)
        display_line['Error Message'] = result['terror']['message']

        display_lines.append(display_line)

    if len(display_lines):
        print('\n')
        print(tabulate(display_lines, headers='keys', showindex='always', tablefmt='psql'))

def __format_color(value, color):
    return '%s%s%s' % (color, value, bcolors.ENDC)

def __read_file(file):
    if file.endswith('.xml.gz'):
        with gzip.open(file, 'rb') as f:
            raw_data = f.read()
            return raw_data
    else:
        with open(file, 'r') as f:
            raw_data = f.read()
            return raw_data
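For readers skimming the diff: the core of the new shazam command is the recursive _doc2xpath walk, which collects every distinct element path (plus attribute predicates) of an XML document into a set, recursing only into the first occurrence of each child tag. Below is a minimal self-contained sketch of that logic; the sample XML, the helper names (remove_ns, doc2xpath), and the printed output are illustrative only, not taken from the package.

import xml.etree.ElementTree as ET

def remove_ns(tag):
    # strip the '{namespace}' prefix ElementTree puts on qualified tags
    return tag.split('}', 1)[1] if '}' in tag else tag

def doc2xpath(el, path, lines, root=''):
    # record the element path, then one predicate per attribute,
    # then recurse into the first occurrence of each child tag
    lines.add(root + path)
    path = root + path
    for name, val in el.items():
        lines.add(path + '[@' + remove_ns(name) + '=' + val + ']')
    seen = set()
    for child in el:
        tag = remove_ns(child.tag)
        if tag not in seen:
            seen.add(tag)
            doc2xpath(child, '/' + tag, lines, root=path)

# hypothetical input, chosen only to show the shape of the output
sample = '<TradeMark operation="Insert"><ApplicationNumber>123</ApplicationNumber></TradeMark>'
lines = set()
elroot = ET.fromstring(sample)
doc2xpath(elroot, remove_ns(elroot.tag), lines)
print(sorted(lines))
# ['TradeMark', 'TradeMark/ApplicationNumber', 'TradeMark[@operation=Insert]']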