ibidem 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibidem-1.0.0/PKG-INFO +29 -0
- ibidem-1.0.0/README +1 -0
- ibidem-1.0.0/ibidem/__init__.py +16 -0
- ibidem-1.0.0/ibidem/cli.py +78 -0
- ibidem-1.0.0/ibidem/config.py +15 -0
- ibidem-1.0.0/ibidem/feature/__init__.py +8 -0
- ibidem-1.0.0/ibidem/feature/highnote.py +42 -0
- ibidem-1.0.0/ibidem/feature/legacy.py +14 -0
- ibidem-1.0.0/ibidem/feature/plain.py +41 -0
- ibidem-1.0.0/ibidem/feature/result.py +142 -0
- ibidem-1.0.0/ibidem/layout.py +307 -0
- ibidem-1.0.0/ibidem/parser/__init__.py +8 -0
- ibidem-1.0.0/ibidem/parser/highnote.py +88 -0
- ibidem-1.0.0/ibidem/parser/plain.py +113 -0
- ibidem-1.0.0/ibidem/parser/textraw.py +111 -0
- ibidem-1.0.0/ibidem/strategy/__init__.py +155 -0
- ibidem-1.0.0/ibidem/strategy/highnote.py +29 -0
- ibidem-1.0.0/ibidem/strategy/moving/__init__.py +8 -0
- ibidem-1.0.0/ibidem/strategy/moving/finish.py +96 -0
- ibidem-1.0.0/ibidem/strategy/moving/judge.py +127 -0
- ibidem-1.0.0/ibidem/strategy/moving/run.py +175 -0
- ibidem-1.0.0/ibidem/strategy/moving/separator.py +106 -0
- ibidem-1.0.0/ibidem/strategy/moving/utils.py +38 -0
- ibidem-1.0.0/ibidem/strategy/plainmoving.py +106 -0
- ibidem-1.0.0/ibidem/utils.py +129 -0
- ibidem-1.0.0/ibidem.egg-info/PKG-INFO +29 -0
- ibidem-1.0.0/ibidem.egg-info/SOURCES.txt +39 -0
- ibidem-1.0.0/ibidem.egg-info/dependency_links.txt +1 -0
- ibidem-1.0.0/ibidem.egg-info/entry_points.txt +2 -0
- ibidem-1.0.0/ibidem.egg-info/requires.txt +16 -0
- ibidem-1.0.0/ibidem.egg-info/top_level.txt +1 -0
- ibidem-1.0.0/pyproject.toml +114 -0
- ibidem-1.0.0/setup.cfg +4 -0
- ibidem-1.0.0/tests/test_footer.py +138 -0
- ibidem-1.0.0/tests/test_footnote.py +98 -0
- ibidem-1.0.0/tests/test_highnotes.py +79 -0
- ibidem-1.0.0/tests/test_judgement.py +24 -0
- ibidem-1.0.0/tests/test_moving.py +145 -0
- ibidem-1.0.0/tests/test_negative.py +40 -0
- ibidem-1.0.0/tests/test_utils.py +58 -0
- ibidem-1.0.0/tests/test_validate.py +87 -0
ibidem-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ibidem
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/anaticulae/ibidem
|
|
7
|
+
Project-URL: Repository, https://github.com/anaticulae/ibidem
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: iamraw<5.0.0,>=4.91.7
|
|
14
|
+
Requires-Dist: utilo<3.0.0,>=2.109.0
|
|
15
|
+
Requires-Dist: configos<2.0.0,>=1.0.4
|
|
16
|
+
Requires-Dist: geostrat<2.0.0,>=1.5.1
|
|
17
|
+
Requires-Dist: konradus<2.0.0,>=1.0.1
|
|
18
|
+
Requires-Dist: elementae<2.0.0,>=1.0.1
|
|
19
|
+
Requires-Dist: germania<2.0.0,>=1.32.0
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: utilotest==1.0.4; extra == "dev"
|
|
22
|
+
Requires-Dist: hoverpower==1.5.1; extra == "dev"
|
|
23
|
+
Requires-Dist: gennex<2.0.0,>=1.0.3; extra == "dev"
|
|
24
|
+
Requires-Dist: rawmaker==2.40.5; extra == "dev"
|
|
25
|
+
Requires-Dist: pagenumber<2.0.0,>=1.0.0; extra == "dev"
|
|
26
|
+
Requires-Dist: groupmes<2.0.0,>=1.1.0; extra == "dev"
|
|
27
|
+
Requires-Dist: resinf<2.0.0,>=1.0.4; extra == "dev"
|
|
28
|
+
|
|
29
|
+
# footnote
|
ibidem-1.0.0/README
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# footnote
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import importlib.metadata
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
# Version of the installed `ibidem` distribution, read from its package
# metadata at import time.
__version__ = importlib.metadata.version('ibidem')

# Absolute path of the directory that contains the `ibidem` package directory.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# Process name; `ibidem.cli.main` passes it as `name` of the feature pack
# configuration.
PROCESS = 'footnote'
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import utilo
|
|
11
|
+
|
|
12
|
+
import ibidem
|
|
13
|
+
|
|
14
|
+
# Description handed to the feature pack configuration in `main`; still a
# placeholder.
DESCRIPTION = 'TODO'

# Steps of the footnote process, handed to `utilo.featurepack` as `workplan`.
# The step names match the modules in `ibidem.feature` (plain, highnote,
# result, legacy); presumably every step is dispatched to the `work` function
# of the module with the same name - TODO confirm against utilo.
WORKPLAN = [
    # `plain` and `highnote` declare the same three rawmaker result files as
    # inputs.
    utilo.create_step(
        'plain',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile('rawmaker', name='horizontals_horizontals'),
        ],
        output=('plain',),
    ),
    utilo.create_step(
        'highnote',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile('rawmaker', name='horizontals_horizontals'),
        ],
        output=('highnote',),
    ),
    # `result` consumes files produced by this process itself ('footnote');
    # the names look like `<step>_<output>` of the two steps above -
    # TODO confirm the naming rule.
    utilo.create_step(
        'result',
        inputs=[
            utilo.ResultFile(producer='footnote', name='highnote_highnote'),
            utilo.ResultFile(producer='footnote', name='plain_plain'),
        ],
        output=('result',),
    ),
    # `legacy` takes the `result` output as its only input; its output path
    # is the one rewritten by `rename` below.
    utilo.create_step(
        'legacy',
        inputs=[
            utilo.ResultFile(producer='footnote', name='result_result'),
        ],
        output=('legacy',),
    ),
]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def rename(path):
    """Map the legacy output name of this process to the groupme name.

    Args:
        path: a single path as `str` or an iterable of paths; iterables
            may be nested and are processed recursively.
    Returns:
        For a `str`, the result of `utilo.rreplace` with
        'footnote__legacy_legacy' as pattern and
        'groupme__footer_footerheader' as replacement. For anything
        else, a list with every item renamed.
    """
    if isinstance(path, str):
        return utilo.rreplace(
            path,
            pattern='footnote__legacy_legacy',
            replace='groupme__footer_footerheader',
        )
    # every non-str input is treated as an iterable of further paths
    return [rename(entry) for entry in path]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def main():
    """Entry point: run the footnote work plan via `utilo.featurepack`."""
    settings = utilo.FeaturePackConfig(
        description=DESCRIPTION,
        multiprocessed=True,
        name=ibidem.PROCESS,
        pages=True,
        # rewrites the legacy output name, see `rename`
        rename=rename,
        version=ibidem.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=ibidem.ROOT,
        featurepackage='ibidem.feature',
        config=settings,
    )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import configos
|
|
11
|
+
import texmex
|
|
12
|
+
|
|
13
|
+
# Minimum width of a horizontal line; the feature steps pass it as
# `width_min` to `serializeraw.load_horizontals`.
# NOTE(review): the unit is not visible here - confirm.
FOOTER_SEPARATOR_WIDTH_MIN = configos.HV_INT_PLUS(default=70)

# Text state selection passed as `state` to `serializeraw.ptn_fromfile`:
# the VISIBLE and FOOTNOTE text states combined with `|`.
VISIBLE = texmex.TextState.VISIBLE | texmex.TextState.FOOTNOTE
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import serializeraw
|
|
11
|
+
|
|
12
|
+
import ibidem.config
|
|
13
|
+
import ibidem.strategy.highnote
|
|
14
|
+
import ibidem.utils
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def work(
    text: str,
    textpositions: str,
    horizontals: str,
    pages=None,
) -> str:
    """Run the highnote strategy and return its dumped result.

    Args:
        text: text input, forwarded to `serializeraw.ptn_fromfile`
        textpositions: text position input, forwarded to
            `serializeraw.ptn_fromfile`
        horizontals: horizontal line input, forwarded to
            `serializeraw.load_horizontals`
        pages: optional page selection, forwarded to both loaders
    Returns:
        Result of `HighnoteStrategy.result()` serialized with
        `serializeraw.dump_headerfooter`.
    """
    # only lines of at least the configured minimum width are requested
    lines = serializeraw.load_horizontals(
        horizontals,
        pages=pages,
        width_min=ibidem.config.FOOTER_SEPARATOR_WIDTH_MIN,
    )
    navigators = ibidem.utils.rotate_ifrequired(
        serializeraw.ptn_fromfile(
            text,
            textpositions,
            pages=pages,
            state=ibidem.config.VISIBLE,
        ))
    detected = ibidem.strategy.highnote.HighnoteStrategy(
        horizontals=lines,
        ptns=navigators,
    ).result()
    return serializeraw.dump_headerfooter(detected)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import utilo
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def work(xresult) -> str:
    """Pass the `result` step output through unchanged.

    Args:
        xresult: input handed to `utilo.file_read`
    Returns:
        Whatever `utilo.file_read` returns for `xresult`.
    """
    content = utilo.file_read(xresult)
    return content
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import serializeraw
|
|
11
|
+
|
|
12
|
+
import ibidem.config
|
|
13
|
+
import ibidem.strategy.plainmoving
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def work(
    text: str,
    textpositions: str,
    horizontals: str,
    pages=None,
) -> str:
    """Run the plain moving strategy and return its dumped result.

    Args:
        text: text input, forwarded to `serializeraw.ptn_fromfile`
        textpositions: text position input, forwarded to
            `serializeraw.ptn_fromfile`
        horizontals: horizontal line input, forwarded to
            `serializeraw.load_horizontals`
        pages: optional page selection, forwarded to both loaders
    Returns:
        Result of `PlainMovingStrategy.result()` serialized with
        `serializeraw.dump_headerfooter`.
    """
    # This module does not import `ibidem.utils` at file level. Import it
    # explicitly so that `ibidem.utils.rotate_ifrequired` does not depend on
    # some other module having loaded the submodule before.
    import ibidem.utils

    # load
    horizontals = serializeraw.load_horizontals(
        horizontals,
        pages=pages,
        width_min=ibidem.config.FOOTER_SEPARATOR_WIDTH_MIN,
    )
    ptns = serializeraw.ptn_fromfile(
        text,
        textpositions,
        pages=pages,
        state=ibidem.config.VISIBLE,
    )
    ptns = ibidem.utils.rotate_ifrequired(ptns)
    # detect
    strategy = ibidem.strategy.plainmoving.PlainMovingStrategy(
        horizontals=horizontals,
        ptns=ptns,
    )
    result = strategy.result()
    # dump
    dumped = serializeraw.dump_headerfooter(result)
    return dumped
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2022 by Helmut Konrad Fahrendholz. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Footer Extraction Step
|
|
10
|
+
=============================
|
|
11
|
+
|
|
12
|
+
TODO:
|
|
13
|
+
what should we do with empty header/footer
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import collections
|
|
17
|
+
|
|
18
|
+
import iamraw
|
|
19
|
+
import serializeraw
|
|
20
|
+
import utilo
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def work(
    xhighnote: str,
    xplain: str,
    pages=None,
) -> str:
    """Merge the highnote and plain results into one dumped result.

    Both inputs are loaded with `serializeraw.load_headerfooter`, merged
    page by page with `judge_strategy` and checked with `validate`.

    Args:
        xhighnote: output of the `highnote` step
        xplain: output of the `plain` step
        pages: optional page selection, forwarded to the loader
    Returns:
        Merged result serialized with `serializeraw.dump_headerfooter`.
    Raises:
        ValueError: if `validate` finds a duplicated page in the merged
            result.
    """
    # order matters: `judge_strategy` prefers the first entry (highnote)
    loaded = tuple(
        serializeraw.load_headerfooter(source, pages=pages)
        for source in (xhighnote, xplain))
    # select the best one
    merged = judge_strategy(loaded)
    validate(merged)
    # dump
    return serializeraw.dump_headerfooter(merged)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def judge_strategy(
    results: list[iamraw.PageContentFooterHeaders],
) -> iamraw.PageContentFooterHeaders:
    """Decide per page which strategy result fits best.

    The results are synchronized by page with `utilo.sync_pages`. Exactly
    two results are expected, in this order:

    - moving: its footer is used whenever it has one (first prio)
    - plain moving: its footer is used only if the first result has no
      footer for the page (fallback)

    Args:
        results: pair of strategy results, first the preferred one, then
            the fallback
    Returns:
        list with one `iamraw.PageContentFooterHeader` per synchronized
        page; `footer` is None if neither strategy provides one.
    """
    assert results is not None, 'require list of strategy results'
    result = []
    for pdfpage, (moving, plainmoving) in utilo.sync_pages(results):
        # A strategy may have no entry for a page; start with the preferred
        # strategy. NOTE: 'moving' is logged whenever that strategy has an
        # entry for the page, even if the entry carries no footer.
        footer = moving.footer if moving else None
        footer_best = 'moving' if moving else None
        if not footer and plainmoving and plainmoving.footer:
            # use plain moving only if no other strategy works
            footer = plainmoving.footer
            footer_best = 'plain'
        # log footer best
        if footer_best:
            utilo.verbose(f'footer: {pdfpage} {footer_best}')
        current = iamraw.PageContentFooterHeader(
            footer=footer,
            page=pdfpage,
        )
        result.append(current)

    # internal invariant: pages must come out in ascending order
    page_order = [item.page for item in result]
    assert utilo.isascending(page_order), page_order
    return result
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def quality(results: list) -> tuple:
    """Determine quality[0.0, 1.0] of every extraction strategy.

    The quality of a strategy is the share of synchronized pages for
    which it delivers a truthy item.

    Args:
        results: one result per strategy, synchronized by page via
            `utilo.sync_pages`
    Returns:
        tuple with one value per strategy, in the order of `results`;
        every value is 0 if there are no pages at all.
    """
    seen = set()
    # number of pages with a result, keyed by strategy position
    hits = collections.Counter()
    for pdfpage, data in utilo.sync_pages(results):
        seen.add(pdfpage)
        hits.update(position for position, item in enumerate(data) if item)
    total = len(seen)
    if not total:
        # no pages: avoid the division and report 0 for every strategy
        return tuple(0 for _ in results)
    return tuple(hits[position] / total for position, _ in enumerate(results))
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def validate(items: list):
    """Ensure that no page occurs more than once in `items`.

    Args:
        items(list): list of objects with a `page` attribute
    Raises:
        ValueError: if some page attribute is duplicated; the message
            contains one line per duplicated page with its count.
    """
    # TODO: REMOVE AFTER UPGRADING IAMRAW
    occurrences = collections.Counter(item.page for item in items)
    complaints = [
        f'duplicated page: {page} ({count})'
        for page, count in occurrences.most_common()
        if count > 1
    ]
    if complaints:
        raise ValueError(utilo.NEWLINE.join(complaints))
|