ocrd 3.0.0b6__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +3 -1
- ocrd/decorators/__init__.py +3 -2
- ocrd/mets_server.py +62 -42
- ocrd/processor/base.py +25 -9
- ocrd/processor/builtin/dummy/ocrd-tool.json +20 -0
- ocrd/processor/builtin/dummy_processor.py +0 -3
- ocrd/processor/builtin/filter_processor.py +108 -0
- ocrd/resource_manager.py +4 -0
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/METADATA +2 -1
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/RECORD +34 -32
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/entry_points.txt +1 -0
- ocrd_modelfactory/__init__.py +7 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_page.py +22 -3
- ocrd_models/ocrd_page_generateds.py +2813 -1438
- ocrd_models/xpath_functions.py +51 -0
- ocrd_network/cli/client.py +27 -8
- ocrd_network/client.py +9 -6
- ocrd_network/client_utils.py +25 -14
- ocrd_network/processing_server.py +27 -15
- ocrd_network/processing_worker.py +7 -4
- ocrd_network/processor_server.py +2 -1
- ocrd_network/rabbitmq_utils/connector.py +2 -2
- ocrd_network/runtime_data/deployer.py +28 -18
- ocrd_network/server_cache.py +26 -23
- ocrd_network/server_utils.py +40 -4
- ocrd_network/tcp_to_uds_mets_proxy.py +8 -5
- ocrd_network/utils.py +19 -15
- ocrd_utils/config.py +38 -16
- ocrd_utils/logging.py +27 -56
- ocrd_utils/ocrd_logging.conf +14 -16
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/WHEEL +0 -0
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/top_level.txt +0 -0
ocrd/cli/__init__.py
CHANGED
|
@@ -16,7 +16,7 @@ def command_with_replaced_help(*replacements):
|
|
|
16
16
|
|
|
17
17
|
class CommandWithReplacedHelp(click.Command):
|
|
18
18
|
def get_help(self, ctx):
|
|
19
|
-
newhelp = super().get_help(ctx)
|
|
19
|
+
newhelp : str = super().get_help(ctx)
|
|
20
20
|
for replacement in replacements:
|
|
21
21
|
newhelp = re.sub(*replacement, newhelp)
|
|
22
22
|
# print(newhelp)
|
|
@@ -83,6 +83,8 @@ Variables:
|
|
|
83
83
|
\b
|
|
84
84
|
{config.describe('OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS')}
|
|
85
85
|
\b
|
|
86
|
+
{config.describe('OCRD_NETWORK_RABBITMQ_HEARTBEAT')}
|
|
87
|
+
\b
|
|
86
88
|
{config.describe('OCRD_PROFILE_FILE')}
|
|
87
89
|
\b
|
|
88
90
|
{config.describe('OCRD_PROFILE', wrap_text=False)}
|
ocrd/decorators/__init__.py
CHANGED
|
@@ -48,6 +48,9 @@ def ocrd_cli_wrap_processor(
|
|
|
48
48
|
# ocrd_network params end #
|
|
49
49
|
**kwargs
|
|
50
50
|
):
|
|
51
|
+
# init logging handlers so no imported libs can preempt ours
|
|
52
|
+
initLogging()
|
|
53
|
+
|
|
51
54
|
# FIXME: remove workspace arg entirely
|
|
52
55
|
processor = processorClass(None)
|
|
53
56
|
if not sys.argv[1:]:
|
|
@@ -89,8 +92,6 @@ def ocrd_cli_wrap_processor(
|
|
|
89
92
|
# Used for checking/starting network agents for the WebAPI architecture
|
|
90
93
|
check_and_run_network_agent(processorClass, subcommand, address, database, queue)
|
|
91
94
|
|
|
92
|
-
# from here: single-run processing context
|
|
93
|
-
initLogging()
|
|
94
95
|
if 'parameter' in kwargs:
|
|
95
96
|
# Disambiguate parameter file/literal, and resolve file
|
|
96
97
|
def resolve(name):
|
ocrd/mets_server.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
2
|
# METS server functionality
|
|
3
3
|
"""
|
|
4
|
+
import os
|
|
4
5
|
import re
|
|
5
6
|
from os import _exit, chmod
|
|
7
|
+
import signal
|
|
6
8
|
from typing import Dict, Optional, Union, List, Tuple
|
|
7
9
|
from time import sleep
|
|
8
10
|
from pathlib import Path
|
|
@@ -155,13 +157,13 @@ class ClientSideOcrdMets:
|
|
|
155
157
|
Request writing the changes to the file system
|
|
156
158
|
"""
|
|
157
159
|
if not self.multiplexing_mode:
|
|
158
|
-
self.session.request("PUT", url=self.url)
|
|
160
|
+
return self.session.request("PUT", url=self.url).text
|
|
159
161
|
else:
|
|
160
|
-
self.session.request(
|
|
162
|
+
return self.session.request(
|
|
161
163
|
"POST",
|
|
162
164
|
self.url,
|
|
163
165
|
json=MpxReq.save(self.ws_dir_path)
|
|
164
|
-
)
|
|
166
|
+
).json()["text"]
|
|
165
167
|
|
|
166
168
|
def stop(self):
|
|
167
169
|
"""
|
|
@@ -169,14 +171,13 @@ class ClientSideOcrdMets:
|
|
|
169
171
|
"""
|
|
170
172
|
try:
|
|
171
173
|
if not self.multiplexing_mode:
|
|
172
|
-
self.session.request("DELETE", self.url)
|
|
173
|
-
return
|
|
174
|
+
return self.session.request("DELETE", self.url).text
|
|
174
175
|
else:
|
|
175
|
-
self.session.request(
|
|
176
|
+
return self.session.request(
|
|
176
177
|
"POST",
|
|
177
178
|
self.url,
|
|
178
179
|
json=MpxReq.stop(self.ws_dir_path)
|
|
179
|
-
)
|
|
180
|
+
).json()["text"]
|
|
180
181
|
except ConnectionError:
|
|
181
182
|
# Expected because we exit the process without returning
|
|
182
183
|
pass
|
|
@@ -323,7 +324,7 @@ class ClientSideOcrdMets:
|
|
|
323
324
|
|
|
324
325
|
|
|
325
326
|
class MpxReq:
|
|
326
|
-
"""This class
|
|
327
|
+
"""This class wraps the request bodies needed for the tcp forwarding
|
|
327
328
|
|
|
328
329
|
For every mets-server-call like find_files or workspace_path a special request_body is
|
|
329
330
|
needed to call `MetsServerProxy.forward_tcp_request`. These are created by this functions.
|
|
@@ -346,12 +347,12 @@ class MpxReq:
|
|
|
346
347
|
@staticmethod
|
|
347
348
|
def save(ws_dir_path: str) -> Dict:
|
|
348
349
|
return MpxReq.__args_wrapper(
|
|
349
|
-
ws_dir_path, method_type="PUT", response_type="
|
|
350
|
+
ws_dir_path, method_type="PUT", response_type="text", request_url="", request_data={})
|
|
350
351
|
|
|
351
352
|
@staticmethod
|
|
352
353
|
def stop(ws_dir_path: str) -> Dict:
|
|
353
354
|
return MpxReq.__args_wrapper(
|
|
354
|
-
ws_dir_path, method_type="DELETE", response_type="
|
|
355
|
+
ws_dir_path, method_type="DELETE", response_type="text", request_url="", request_data={})
|
|
355
356
|
|
|
356
357
|
@staticmethod
|
|
357
358
|
def reload(ws_dir_path: str) -> Dict:
|
|
@@ -428,18 +429,24 @@ class OcrdMetsServer:
|
|
|
428
429
|
|
|
429
430
|
@staticmethod
|
|
430
431
|
def kill_process(mets_server_pid: int):
|
|
431
|
-
|
|
432
|
+
os.kill(mets_server_pid, signal.SIGINT)
|
|
433
|
+
sleep(3)
|
|
434
|
+
try:
|
|
435
|
+
os.kill(mets_server_pid, signal.SIGKILL)
|
|
436
|
+
except ProcessLookupError as e:
|
|
437
|
+
pass
|
|
432
438
|
|
|
433
439
|
def shutdown(self):
|
|
440
|
+
pid = os.getpid()
|
|
441
|
+
self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.")
|
|
442
|
+
os.kill(pid, signal.SIGTERM)
|
|
434
443
|
if self.is_uds:
|
|
435
444
|
if Path(self.url).exists():
|
|
436
|
-
self.log.
|
|
445
|
+
self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}")
|
|
437
446
|
Path(self.url).unlink()
|
|
438
|
-
# os._exit because uvicorn catches SystemExit raised by sys.exit
|
|
439
|
-
_exit(0)
|
|
440
447
|
|
|
441
448
|
def startup(self):
|
|
442
|
-
self.log.info("
|
|
449
|
+
self.log.info(f"Configuring the Mets Server")
|
|
443
450
|
|
|
444
451
|
workspace = self.workspace
|
|
445
452
|
|
|
@@ -465,32 +472,49 @@ class OcrdMetsServer:
|
|
|
465
472
|
"""
|
|
466
473
|
Write current changes to the file system
|
|
467
474
|
"""
|
|
468
|
-
|
|
475
|
+
workspace.save_mets()
|
|
476
|
+
response = Response(content="The Mets Server is writing changes to disk.", media_type='text/plain')
|
|
477
|
+
self.log.info(f"PUT / -> {response.__dict__}")
|
|
478
|
+
return response
|
|
469
479
|
|
|
470
480
|
@app.delete(path='/')
|
|
471
|
-
|
|
481
|
+
def stop():
|
|
472
482
|
"""
|
|
473
483
|
Stop the mets server
|
|
474
484
|
"""
|
|
475
|
-
getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}')
|
|
476
485
|
workspace.save_mets()
|
|
486
|
+
response = Response(content="The Mets Server will shut down soon...", media_type='text/plain')
|
|
477
487
|
self.shutdown()
|
|
488
|
+
self.log.info(f"DELETE / -> {response.__dict__}")
|
|
489
|
+
return response
|
|
478
490
|
|
|
479
491
|
@app.post(path='/reload')
|
|
480
|
-
|
|
492
|
+
def workspace_reload_mets():
|
|
481
493
|
"""
|
|
482
494
|
Reload mets file from the file system
|
|
483
495
|
"""
|
|
484
496
|
workspace.reload_mets()
|
|
485
|
-
|
|
497
|
+
response = Response(content=f"Reloaded from {workspace.directory}", media_type='text/plain')
|
|
498
|
+
self.log.info(f"POST /reload -> {response.__dict__}")
|
|
499
|
+
return response
|
|
486
500
|
|
|
487
501
|
@app.get(path='/unique_identifier', response_model=str)
|
|
488
502
|
async def unique_identifier():
|
|
489
|
-
|
|
503
|
+
response = Response(content=workspace.mets.unique_identifier, media_type='text/plain')
|
|
504
|
+
self.log.info(f"GET /unique_identifier -> {response.__dict__}")
|
|
505
|
+
return response
|
|
490
506
|
|
|
491
507
|
@app.get(path='/workspace_path', response_model=str)
|
|
492
508
|
async def workspace_path():
|
|
493
|
-
|
|
509
|
+
response = Response(content=workspace.directory, media_type="text/plain")
|
|
510
|
+
self.log.info(f"GET /workspace_path -> {response.__dict__}")
|
|
511
|
+
return response
|
|
512
|
+
|
|
513
|
+
@app.get(path='/physical_pages', response_model=OcrdPageListModel)
|
|
514
|
+
async def physical_pages():
|
|
515
|
+
response = {'physical_pages': workspace.mets.physical_pages}
|
|
516
|
+
self.log.info(f"GET /physical_pages -> {response}")
|
|
517
|
+
return response
|
|
494
518
|
|
|
495
519
|
@app.get(path='/physical_pages', response_model=OcrdPageListModel)
|
|
496
520
|
async def physical_pages():
|
|
@@ -498,18 +522,24 @@ class OcrdMetsServer:
|
|
|
498
522
|
|
|
499
523
|
@app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
|
|
500
524
|
async def file_groups():
|
|
501
|
-
|
|
525
|
+
response = {'file_groups': workspace.mets.file_groups}
|
|
526
|
+
self.log.info(f"GET /file_groups -> {response}")
|
|
527
|
+
return response
|
|
502
528
|
|
|
503
529
|
@app.get(path='/agent', response_model=OcrdAgentListModel)
|
|
504
530
|
async def agents():
|
|
505
|
-
|
|
531
|
+
response = OcrdAgentListModel.create(workspace.mets.agents)
|
|
532
|
+
self.log.info(f"GET /agent -> {response.__dict__}")
|
|
533
|
+
return response
|
|
506
534
|
|
|
507
535
|
@app.post(path='/agent', response_model=OcrdAgentModel)
|
|
508
536
|
async def add_agent(agent: OcrdAgentModel):
|
|
509
537
|
kwargs = agent.dict()
|
|
510
538
|
kwargs['_type'] = kwargs.pop('type')
|
|
511
539
|
workspace.mets.add_agent(**kwargs)
|
|
512
|
-
|
|
540
|
+
response = agent
|
|
541
|
+
self.log.info(f"POST /agent -> {response.__dict__}")
|
|
542
|
+
return response
|
|
513
543
|
|
|
514
544
|
@app.get(path="/file", response_model=OcrdFileListModel)
|
|
515
545
|
async def find_files(
|
|
@@ -526,7 +556,9 @@ class OcrdMetsServer:
|
|
|
526
556
|
found = workspace.mets.find_all_files(
|
|
527
557
|
fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, local_filename=local_filename, url=url
|
|
528
558
|
)
|
|
529
|
-
|
|
559
|
+
response = OcrdFileListModel.create(found)
|
|
560
|
+
self.log.info(f"GET /file -> {response.__dict__}")
|
|
561
|
+
return response
|
|
530
562
|
|
|
531
563
|
@app.post(path='/file', response_model=OcrdFileModel)
|
|
532
564
|
async def add_file(
|
|
@@ -549,7 +581,9 @@ class OcrdMetsServer:
|
|
|
549
581
|
# Add to workspace
|
|
550
582
|
kwargs = file_resource.dict()
|
|
551
583
|
workspace.add_file(**kwargs, force=force)
|
|
552
|
-
|
|
584
|
+
response = file_resource
|
|
585
|
+
self.log.info(f"POST /file -> {response.__dict__}")
|
|
586
|
+
return response
|
|
553
587
|
|
|
554
588
|
# ------------- #
|
|
555
589
|
|
|
@@ -557,9 +591,6 @@ class OcrdMetsServer:
|
|
|
557
591
|
# Create socket and change to world-readable and -writable to avoid permission errors
|
|
558
592
|
self.log.debug(f"chmod 0o677 {self.url}")
|
|
559
593
|
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
560
|
-
if Path(self.url).exists() and not is_socket_in_use(self.url):
|
|
561
|
-
# remove leftover unused socket which blocks startup
|
|
562
|
-
Path(self.url).unlink()
|
|
563
594
|
server.bind(self.url) # creates the socket file
|
|
564
595
|
atexit.register(self.shutdown)
|
|
565
596
|
server.close()
|
|
@@ -571,16 +602,5 @@ class OcrdMetsServer:
|
|
|
571
602
|
uvicorn_kwargs['log_config'] = None
|
|
572
603
|
uvicorn_kwargs['access_log'] = False
|
|
573
604
|
|
|
574
|
-
self.log.
|
|
605
|
+
self.log.info("Starting the uvicorn Mets Server")
|
|
575
606
|
uvicorn.run(app, **uvicorn_kwargs)
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
def is_socket_in_use(socket_path):
|
|
579
|
-
if Path(socket_path).exists():
|
|
580
|
-
client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
581
|
-
try:
|
|
582
|
-
client.connect(socket_path)
|
|
583
|
-
except OSError:
|
|
584
|
-
return False
|
|
585
|
-
client.close()
|
|
586
|
-
return True
|
ocrd/processor/base.py
CHANGED
|
@@ -18,10 +18,11 @@ from os import getcwd
|
|
|
18
18
|
from pathlib import Path
|
|
19
19
|
from typing import Any, Dict, List, Optional, Tuple, Union, get_args
|
|
20
20
|
import sys
|
|
21
|
+
import logging
|
|
22
|
+
import logging.handlers
|
|
21
23
|
import inspect
|
|
22
24
|
import tarfile
|
|
23
25
|
import io
|
|
24
|
-
import weakref
|
|
25
26
|
from collections import defaultdict
|
|
26
27
|
from frozendict import frozendict
|
|
27
28
|
# concurrent.futures is buggy in py38,
|
|
@@ -129,7 +130,8 @@ class DummyExecutor:
|
|
|
129
130
|
def __init__(self, initializer=None, initargs=(), **kwargs):
|
|
130
131
|
initializer(*initargs)
|
|
131
132
|
def shutdown(self, **kwargs):
|
|
132
|
-
|
|
133
|
+
# allow gc to catch processor instance (unless cached)
|
|
134
|
+
_page_worker_set_ctxt(None, None)
|
|
133
135
|
def submit(self, fn, *args, **kwargs) -> DummyFuture:
|
|
134
136
|
return DummyFuture(fn, *args, **kwargs)
|
|
135
137
|
|
|
@@ -158,12 +160,12 @@ class Processor():
|
|
|
158
160
|
|
|
159
161
|
max_workers : int = -1
|
|
160
162
|
"""
|
|
161
|
-
maximum number of processor
|
|
163
|
+
maximum number of processor forks for page-parallel processing (ignored if negative),
|
|
162
164
|
to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
|
|
163
165
|
whatever is smaller).
|
|
164
166
|
|
|
165
167
|
(Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
|
|
166
|
-
- at once, or if your class
|
|
168
|
+
- at once, or if your class already creates threads prior to forking, e.g. during ``setup``.)
|
|
167
169
|
"""
|
|
168
170
|
|
|
169
171
|
max_page_seconds : int = -1
|
|
@@ -366,12 +368,14 @@ class Processor():
|
|
|
366
368
|
self._base_logger = getLogger('ocrd.processor.base')
|
|
367
369
|
if parameter is not None:
|
|
368
370
|
self.parameter = parameter
|
|
369
|
-
# ensure that shutdown gets called at destruction
|
|
370
|
-
self._finalizer = weakref.finalize(self, self.shutdown)
|
|
371
371
|
# workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
|
|
372
372
|
setattr(self, 'process',
|
|
373
373
|
deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
|
|
374
374
|
|
|
375
|
+
def __del__(self):
|
|
376
|
+
self._base_logger.debug("shutting down %s in %s", repr(self), mp.current_process().name)
|
|
377
|
+
self.shutdown()
|
|
378
|
+
|
|
375
379
|
def show_help(self, subcommand=None):
|
|
376
380
|
"""
|
|
377
381
|
Print a usage description including the standard CLI and all of this processor's ocrd-tool
|
|
@@ -502,7 +506,7 @@ class Processor():
|
|
|
502
506
|
# set up multitasking
|
|
503
507
|
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
504
508
|
if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
|
|
505
|
-
self._base_logger.info("limiting number of
|
|
509
|
+
self._base_logger.info("limiting number of workers from %d to %d", max_workers, self.max_workers)
|
|
506
510
|
max_workers = self.max_workers
|
|
507
511
|
if max_workers > 1:
|
|
508
512
|
assert isinstance(workspace.mets, ClientSideOcrdMets), \
|
|
@@ -514,22 +518,31 @@ class Processor():
|
|
|
514
518
|
|
|
515
519
|
if max_workers > 1:
|
|
516
520
|
executor_cls = ProcessPoolExecutor
|
|
521
|
+
log_queue = mp.Queue()
|
|
517
522
|
else:
|
|
518
523
|
executor_cls = DummyExecutor
|
|
524
|
+
log_queue = None
|
|
519
525
|
executor = executor_cls(
|
|
520
526
|
max_workers=max_workers or 1,
|
|
521
527
|
# only forking method avoids pickling
|
|
522
528
|
context=mp.get_context('fork'),
|
|
523
529
|
# share processor instance as global to avoid pickling
|
|
524
530
|
initializer=_page_worker_set_ctxt,
|
|
525
|
-
initargs=(self,),
|
|
531
|
+
initargs=(self, log_queue),
|
|
526
532
|
)
|
|
533
|
+
if max_workers > 1:
|
|
534
|
+
# forward messages from log queue (in subprocesses) to all root handlers
|
|
535
|
+
log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True)
|
|
536
|
+
log_listener.start()
|
|
527
537
|
try:
|
|
528
538
|
self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
|
|
529
539
|
tasks = self.process_workspace_submit_tasks(executor, max_seconds)
|
|
530
540
|
stats = self.process_workspace_handle_tasks(tasks)
|
|
531
541
|
finally:
|
|
532
542
|
executor.shutdown(kill_workers=True, wait=False)
|
|
543
|
+
if max_workers > 1:
|
|
544
|
+
log_listener.stop()
|
|
545
|
+
del log_listener
|
|
533
546
|
|
|
534
547
|
except NotImplementedError:
|
|
535
548
|
# fall back to deprecated method
|
|
@@ -1109,13 +1122,16 @@ in Processor.process_workspace. Forking allows inheriting global
|
|
|
1109
1122
|
objects, and with the METS Server we do not mutate the local
|
|
1110
1123
|
processor instance anyway.
|
|
1111
1124
|
"""
|
|
1112
|
-
def _page_worker_set_ctxt(processor):
|
|
1125
|
+
def _page_worker_set_ctxt(processor, log_queue):
|
|
1113
1126
|
"""
|
|
1114
1127
|
Overwrites `ocrd.processor.base._page_worker_processor` instance
|
|
1115
1128
|
for sharing with subprocesses in ProcessPoolExecutor initializer.
|
|
1116
1129
|
"""
|
|
1117
1130
|
global _page_worker_processor
|
|
1118
1131
|
_page_worker_processor = processor
|
|
1132
|
+
if log_queue:
|
|
1133
|
+
# replace all log handlers with just one queue handler
|
|
1134
|
+
logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
|
|
1119
1135
|
|
|
1120
1136
|
def _page_worker(timeout, *input_files):
|
|
1121
1137
|
"""
|
|
ocrd/processor/builtin/dummy/ocrd-tool.json
CHANGED
|
@@ -16,6 +16,26 @@
|
|
|
16
16
|
"description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
|
|
17
17
|
}
|
|
18
18
|
}
|
|
19
|
+
},
|
|
20
|
+
"ocrd-filter": {
|
|
21
|
+
"executable": "ocrd-filter",
|
|
22
|
+
"description": "Bare-bones processor can be dynamically configured to remove segments based on XPath queries",
|
|
23
|
+
"steps": ["recognition/post-correction"],
|
|
24
|
+
"categories": ["Quality assurance"],
|
|
25
|
+
"input_file_grp_cardinality": 1,
|
|
26
|
+
"output_file_grp_cardinality": 1,
|
|
27
|
+
"parameters": {
|
|
28
|
+
"select": {
|
|
29
|
+
"type": "string",
|
|
30
|
+
"default": "//*[ends-with(local-name(),'Region')]",
|
|
31
|
+
"description": "Which segments to select for removal. An XPath 2.0 query expression (path and optional predicates), with 'pc' as namespace prefix for PAGE-XML and our extension functions (see help text). Only selection of segment hierarchy elements is allowed (so e.g. `*` would be equivalent to `pc:NoiseRegion|pc:LineDrawingRegion|pc:AdvertRegion|pc:ImageRegion|pc:ChartRegion|pc:MusicRegion|pc:GraphicRegion|pc:UnknownRegion|pc:CustomRegion|pc:SeparatorRegion|pc:MathsRegion|pc:TextRegion|pc:MapRegion|pc:ChemRegion|pc:TableRegion|pc:TextLine|pc:Word|pc:Glyph`, but `pc:MetadataItem` or `pc:Border` or `pc:Coords` would not match).\nFor example, to remove words or glyphs with low text confidence, select '(pc:Word|pc:Glyph)[pc:TextEquiv/@conf < 0.7]'. Or low layout confidence, '*[pc:Coords/@conf < 0.7]'.\nTo remove high pixel-to-character rate, select '*[pc:pixelarea(.) div string-length(pc:textequiv(.)) > 10000]'."
|
|
32
|
+
},
|
|
33
|
+
"plot": {
|
|
34
|
+
"type": "boolean",
|
|
35
|
+
"default": false,
|
|
36
|
+
"description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
|
|
37
|
+
}
|
|
38
|
+
}
|
|
19
39
|
}
|
|
20
40
|
}
|
|
21
41
|
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# pylint: disable=missing-module-docstring,invalid-name
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from lxml import etree
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
|
|
8
|
+
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
|
9
|
+
from ocrd_models import OcrdPage
|
|
10
|
+
|
|
11
|
+
_SEGTYPES = [
|
|
12
|
+
"NoiseRegion",
|
|
13
|
+
"LineDrawingRegion",
|
|
14
|
+
"AdvertRegion",
|
|
15
|
+
"ImageRegion",
|
|
16
|
+
"ChartRegion",
|
|
17
|
+
"MusicRegion",
|
|
18
|
+
"GraphicRegion",
|
|
19
|
+
"UnknownRegion",
|
|
20
|
+
"CustomRegion",
|
|
21
|
+
"SeparatorRegion",
|
|
22
|
+
"MathsRegion",
|
|
23
|
+
"TextRegion",
|
|
24
|
+
"MapRegion",
|
|
25
|
+
"ChemRegion",
|
|
26
|
+
"TableRegion",
|
|
27
|
+
"TextLine",
|
|
28
|
+
"Word",
|
|
29
|
+
"Glyph"
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
class FilterProcessor(Processor):
|
|
33
|
+
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
|
|
34
|
+
"""
|
|
35
|
+
Remove PAGE segment hierarchy elements based on flexible selection criteria.
|
|
36
|
+
|
|
37
|
+
Open and deserialise PAGE input file, then iterate over the segment hierarchy
|
|
38
|
+
down to the level required for ``select`` (which could be multiple levels at once).
|
|
39
|
+
|
|
40
|
+
Remove any segments matching XPath query ``select`` from that hierarchy (and from
|
|
41
|
+
the `ReadingOrder` if it is a region type).
|
|
42
|
+
|
|
43
|
+
\b
|
|
44
|
+
Besides full XPath 2.0 syntax, this supports extra predicates:
|
|
45
|
+
- `pc:pixelarea()` for the number of pixels of the bounding box (or sum area on node sets),
|
|
46
|
+
- `pc:textequiv()` for the first TextEquiv unicode string (or concatenated string on node sets).
|
|
47
|
+
|
|
48
|
+
If ``plot`` is `true`, then extract and write an image file for all removed segments
|
|
49
|
+
to the output fileGrp (without reference to the PAGE).
|
|
50
|
+
|
|
51
|
+
Produce a new PAGE output file by serialising the resulting hierarchy.
|
|
52
|
+
"""
|
|
53
|
+
pcgts = input_pcgts[0]
|
|
54
|
+
result = OcrdPageResult(pcgts)
|
|
55
|
+
nodes = pcgts.xpath(self.parameter['select'])
|
|
56
|
+
# get PAGE objects from matching etree nodes
|
|
57
|
+
# but allow only hierarchy segments
|
|
58
|
+
segments = [segment for segment in map(pcgts.revmap.get, nodes)
|
|
59
|
+
if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
|
|
60
|
+
if not(len(segments)):
|
|
61
|
+
self.logger.info("no matches")
|
|
62
|
+
return result
|
|
63
|
+
rodict = pcgts.get_Page().get_ReadingOrderGroups()
|
|
64
|
+
if self.parameter['plot']:
|
|
65
|
+
page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id)
|
|
66
|
+
for segment in segments:
|
|
67
|
+
segtype = segment.original_tagname_
|
|
68
|
+
self.logger.info("matched %s segment %s", segtype, segment.id)
|
|
69
|
+
parent = segment.parent_object_
|
|
70
|
+
partype = parent.__class__.__name__.replace('Type', '')
|
|
71
|
+
if partype == 'Page':
|
|
72
|
+
getattr(parent, 'get_' + segtype)().remove(segment)
|
|
73
|
+
elif partype.endswith('Region'):
|
|
74
|
+
if segtype.endswith('Region'):
|
|
75
|
+
getattr(parent, 'get_' + segtype)().remove(segment)
|
|
76
|
+
else:
|
|
77
|
+
parent.TextLine.remove(segment)
|
|
78
|
+
elif partype == 'TextLine':
|
|
79
|
+
parent.Word.remove(segment)
|
|
80
|
+
elif partype == 'Word':
|
|
81
|
+
parent.Glyph.remove(segment)
|
|
82
|
+
else:
|
|
83
|
+
raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})")
|
|
84
|
+
segment.parent_object_ = None
|
|
85
|
+
if segtype.endswith('Region') and segment.id in rodict:
|
|
86
|
+
# remove from ReadingOrder as well
|
|
87
|
+
roelem = rodict[segment.id]
|
|
88
|
+
rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', ''))
|
|
89
|
+
rorefs.remove(roelem)
|
|
90
|
+
roelem.parent_object_ = None
|
|
91
|
+
del rodict[segment.id]
|
|
92
|
+
if self.parameter['plot']:
|
|
93
|
+
segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords)
|
|
94
|
+
result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None))
|
|
95
|
+
return result
|
|
96
|
+
|
|
97
|
+
@property
|
|
98
|
+
def metadata_filename(self):
|
|
99
|
+
return 'processor/builtin/dummy/ocrd-tool.json'
|
|
100
|
+
|
|
101
|
+
@property
|
|
102
|
+
def executable(self):
|
|
103
|
+
return 'ocrd-filter'
|
|
104
|
+
|
|
105
|
+
@click.command()
|
|
106
|
+
@ocrd_cli_options
|
|
107
|
+
def cli(*args, **kwargs):
|
|
108
|
+
return ocrd_cli_wrap_processor(FilterProcessor, *args, **kwargs)
|
ocrd/resource_manager.py
CHANGED
|
@@ -23,6 +23,10 @@ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'
|
|
|
23
23
|
|
|
24
24
|
# pylint: enable=wrong-import-position
|
|
25
25
|
|
|
26
|
+
# pylint: enable=wrong-import-position
|
|
27
|
+
|
|
28
|
+
# pylint: enable=wrong-import-position
|
|
29
|
+
|
|
26
30
|
from ocrd_validators import OcrdResourceListValidator
|
|
27
31
|
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
|
|
28
32
|
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.0b6
|
|
3
|
+
Version: 3.0.1
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -17,6 +17,7 @@ Requires-Dist: click>=7
|
|
|
17
17
|
Requires-Dist: cryptography<43.0.0
|
|
18
18
|
Requires-Dist: Deprecated==1.2.0
|
|
19
19
|
Requires-Dist: docker
|
|
20
|
+
Requires-Dist: elementpath
|
|
20
21
|
Requires-Dist: fastapi>=0.78.0
|
|
21
22
|
Requires-Dist: filetype
|
|
22
23
|
Requires-Dist: Flask
|