ibidem 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. ibidem-1.0.0/PKG-INFO +29 -0
  2. ibidem-1.0.0/README +1 -0
  3. ibidem-1.0.0/ibidem/__init__.py +16 -0
  4. ibidem-1.0.0/ibidem/cli.py +78 -0
  5. ibidem-1.0.0/ibidem/config.py +15 -0
  6. ibidem-1.0.0/ibidem/feature/__init__.py +8 -0
  7. ibidem-1.0.0/ibidem/feature/highnote.py +42 -0
  8. ibidem-1.0.0/ibidem/feature/legacy.py +14 -0
  9. ibidem-1.0.0/ibidem/feature/plain.py +41 -0
  10. ibidem-1.0.0/ibidem/feature/result.py +142 -0
  11. ibidem-1.0.0/ibidem/layout.py +307 -0
  12. ibidem-1.0.0/ibidem/parser/__init__.py +8 -0
  13. ibidem-1.0.0/ibidem/parser/highnote.py +88 -0
  14. ibidem-1.0.0/ibidem/parser/plain.py +113 -0
  15. ibidem-1.0.0/ibidem/parser/textraw.py +111 -0
  16. ibidem-1.0.0/ibidem/strategy/__init__.py +155 -0
  17. ibidem-1.0.0/ibidem/strategy/highnote.py +29 -0
  18. ibidem-1.0.0/ibidem/strategy/moving/__init__.py +8 -0
  19. ibidem-1.0.0/ibidem/strategy/moving/finish.py +96 -0
  20. ibidem-1.0.0/ibidem/strategy/moving/judge.py +127 -0
  21. ibidem-1.0.0/ibidem/strategy/moving/run.py +175 -0
  22. ibidem-1.0.0/ibidem/strategy/moving/separator.py +106 -0
  23. ibidem-1.0.0/ibidem/strategy/moving/utils.py +38 -0
  24. ibidem-1.0.0/ibidem/strategy/plainmoving.py +106 -0
  25. ibidem-1.0.0/ibidem/utils.py +129 -0
  26. ibidem-1.0.0/ibidem.egg-info/PKG-INFO +29 -0
  27. ibidem-1.0.0/ibidem.egg-info/SOURCES.txt +39 -0
  28. ibidem-1.0.0/ibidem.egg-info/dependency_links.txt +1 -0
  29. ibidem-1.0.0/ibidem.egg-info/entry_points.txt +2 -0
  30. ibidem-1.0.0/ibidem.egg-info/requires.txt +16 -0
  31. ibidem-1.0.0/ibidem.egg-info/top_level.txt +1 -0
  32. ibidem-1.0.0/pyproject.toml +114 -0
  33. ibidem-1.0.0/setup.cfg +4 -0
  34. ibidem-1.0.0/tests/test_footer.py +138 -0
  35. ibidem-1.0.0/tests/test_footnote.py +98 -0
  36. ibidem-1.0.0/tests/test_highnotes.py +79 -0
  37. ibidem-1.0.0/tests/test_judgement.py +24 -0
  38. ibidem-1.0.0/tests/test_moving.py +145 -0
  39. ibidem-1.0.0/tests/test_negative.py +40 -0
  40. ibidem-1.0.0/tests/test_utils.py +58 -0
  41. ibidem-1.0.0/tests/test_validate.py +87 -0
ibidem-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,29 @@
1
+ Metadata-Version: 2.4
2
+ Name: ibidem
3
+ Version: 1.0.0
4
+ Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/anaticulae/ibidem
7
+ Project-URL: Repository, https://github.com/anaticulae/ibidem
8
+ Classifier: Programming Language :: Python :: 3.12
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Classifier: Programming Language :: Python :: 3.14
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: iamraw<5.0.0,>=4.91.7
14
+ Requires-Dist: utilo<3.0.0,>=2.109.0
15
+ Requires-Dist: configos<2.0.0,>=1.0.4
16
+ Requires-Dist: geostrat<2.0.0,>=1.5.1
17
+ Requires-Dist: konradus<2.0.0,>=1.0.1
18
+ Requires-Dist: elementae<2.0.0,>=1.0.1
19
+ Requires-Dist: germania<2.0.0,>=1.32.0
20
+ Provides-Extra: dev
21
+ Requires-Dist: utilotest==1.0.4; extra == "dev"
22
+ Requires-Dist: hoverpower==1.5.1; extra == "dev"
23
+ Requires-Dist: gennex<2.0.0,>=1.0.3; extra == "dev"
24
+ Requires-Dist: rawmaker==2.40.5; extra == "dev"
25
+ Requires-Dist: pagenumber<2.0.0,>=1.0.0; extra == "dev"
26
+ Requires-Dist: groupmes<2.0.0,>=1.1.0; extra == "dev"
27
+ Requires-Dist: resinf<2.0.0,>=1.0.4; extra == "dev"
28
+
29
+ # footnote
ibidem-1.0.0/README ADDED
@@ -0,0 +1 @@
1
+ # footnote
@@ -0,0 +1,16 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import importlib.metadata
11
+ import os
12
+
13
# Version of the installed ``ibidem`` distribution, read from its package
# metadata at import time.
__version__ = importlib.metadata.version('ibidem')

# Absolute path of the directory that contains this package directory.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir))
# Process name; ibidem/cli.py passes it as ``name`` to the feature pack.
PROCESS = 'footnote'
@@ -0,0 +1,78 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import utilo
11
+
12
+ import ibidem
13
+
14
# Description passed to `utilo.FeaturePackConfig` in `main`; still a
# placeholder value.
DESCRIPTION = 'TODO'

# Steps passed to `utilo.featurepack` in `main`. The step names match the
# modules in `ibidem/feature` (plain, highnote, result, legacy) --
# presumably each step is dispatched to the `work` function of the module
# with the same name; confirm against utilo.
# NOTE(review): `ResultFile` receives the producer by keyword in most
# entries and positionally for `horizontals_horizontals` -- presumably
# equivalent; confirm against the `utilo.ResultFile` signature.
WORKPLAN = [
    # `plain` and `highnote` read the same three rawmaker outputs.
    utilo.create_step(
        'plain',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile('rawmaker', name='horizontals_horizontals'),
        ],
        output=('plain',),
    ),
    utilo.create_step(
        'highnote',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile('rawmaker', name='horizontals_horizontals'),
        ],
        output=('highnote',),
    ),
    # `result` consumes the outputs of the two steps above; the producer
    # 'footnote' equals `ibidem.PROCESS`.
    utilo.create_step(
        'result',
        inputs=[
            utilo.ResultFile(producer='footnote', name='highnote_highnote'),
            utilo.ResultFile(producer='footnote', name='plain_plain'),
        ],
        output=('result',),
    ),
    # `legacy` re-publishes the `result` output; see `rename` for the
    # file name it is mapped to.
    utilo.create_step(
        'legacy',
        inputs=[
            utilo.ResultFile(producer='footnote', name='result_result'),
        ],
        output=('legacy',),
    ),
]
51
+
52
+
53
def rename(path):
    """Map the legacy output name onto the `groupme` footer/header name.

    Args:
        path: a single path string, or an iterable of paths (nested
            iterables are processed recursively).
    Returns:
        For a string, the result of `utilo.rreplace` substituting
        'footnote__legacy_legacy' with 'groupme__footer_footerheader'.
        For any other input, a list with every item renamed.
    """
    if isinstance(path, str):
        return utilo.rreplace(
            path,
            pattern='footnote__legacy_legacy',
            replace='groupme__footer_footerheader',
        )
    # Not a string: treat it as a collection and rename each entry.
    return [rename(entry) for entry in path]
63
+
64
+
65
def main():
    """Command line entry point: run the feature pack with `WORKPLAN`."""
    # Assemble the configuration first to keep the launch call short.
    settings = utilo.FeaturePackConfig(
        description=DESCRIPTION,
        multiprocessed=True,
        name=ibidem.PROCESS,
        pages=True,
        rename=rename,
        version=ibidem.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=ibidem.ROOT,
        featurepackage='ibidem.feature',
        config=settings,
    )
@@ -0,0 +1,15 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import configos
11
+ import texmex
12
+
13
# Passed as `width_min` to `serializeraw.load_horizontals` by the feature
# steps. Presumably the minimum width a horizontal line needs to count as
# a footer separator; unit and exact semantics of `HV_INT_PLUS` are not
# visible here -- confirm against configos.
FOOTER_SEPARATOR_WIDTH_MIN = configos.HV_INT_PLUS(default=70)

# Combination of the VISIBLE and FOOTNOTE text states; passed as `state`
# to `serializeraw.ptn_fromfile` by the feature steps.
VISIBLE = texmex.TextState.VISIBLE | texmex.TextState.FOOTNOTE
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,42 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import serializeraw
11
+
12
+ import ibidem.config
13
+ import ibidem.strategy.highnote
14
+ import ibidem.utils
15
+
16
+
17
def work(
    text: str,
    textpositions: str,
    horizontals: str,
    pages=None,
) -> str:
    """Run the highnote strategy and return the dumped result.

    Args:
        text: text resource handed to `serializeraw.ptn_fromfile`.
        textpositions: text position resource handed to
            `serializeraw.ptn_fromfile`.
        horizontals: horizontal line resource handed to
            `serializeraw.load_horizontals`.
        pages: optional page selection, forwarded to both loaders.
    Returns:
        The strategy result serialized by
        `serializeraw.dump_headerfooter`.
    """
    # Horizontal lines, filtered by the configured minimum width.
    separators = serializeraw.load_horizontals(
        horizontals,
        pages=pages,
        width_min=ibidem.config.FOOTER_SEPARATOR_WIDTH_MIN,
    )
    # Text restricted to the configured text states.
    loaded = serializeraw.ptn_fromfile(
        text,
        textpositions,
        pages=pages,
        state=ibidem.config.VISIBLE,
    )
    rotated = ibidem.utils.rotate_ifrequired(loaded)
    detected = ibidem.strategy.highnote.HighnoteStrategy(
        horizontals=separators,
        ptns=rotated,
    ).result()
    return serializeraw.dump_headerfooter(detected)
@@ -0,0 +1,14 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import utilo
11
+
12
+
13
def work(xresult) -> str:
    """Pass the `result` step output through unchanged.

    Args:
        xresult: resource of the `result` step, read with
            `utilo.file_read`.
    Returns:
        Whatever `utilo.file_read` yields for `xresult`.
    """
    content = utilo.file_read(xresult)
    return content
@@ -0,0 +1,41 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import serializeraw
11
+
12
+ import ibidem.config
13
+ import ibidem.strategy.plainmoving
14
+
15
+
16
def work(
    text: str,
    textpositions: str,
    horizontals: str,
    pages=None,
) -> str:
    """Run the plain-moving strategy and return the dumped result.

    Args:
        text: text resource handed to `serializeraw.ptn_fromfile`.
        textpositions: text position resource handed to
            `serializeraw.ptn_fromfile`.
        horizontals: horizontal line resource handed to
            `serializeraw.load_horizontals`.
        pages: optional page selection, forwarded to both loaders.
    Returns:
        The strategy result serialized by
        `serializeraw.dump_headerfooter`.
    """
    # This module's import block does not import `ibidem.utils`, so the
    # attribute lookup `ibidem.utils` only worked when some other module
    # had imported it before. Import the helper explicitly.
    from ibidem.utils import rotate_ifrequired

    # Horizontal lines, filtered by the configured minimum width.
    horizontals = serializeraw.load_horizontals(
        horizontals,
        pages=pages,
        width_min=ibidem.config.FOOTER_SEPARATOR_WIDTH_MIN,
    )
    # Text restricted to the configured text states.
    ptns = serializeraw.ptn_fromfile(
        text,
        textpositions,
        pages=pages,
        state=ibidem.config.VISIBLE,
    )
    ptns = rotate_ifrequired(ptns)
    strategy = ibidem.strategy.plainmoving.PlainMovingStrategy(
        horizontals=horizontals,
        ptns=ptns,
    )
    return serializeraw.dump_headerfooter(strategy.result())
@@ -0,0 +1,142 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2022 by Helmut Konrad Fahrendholz. All rights reserved.
5
+ # This file is property of Helmut Konrad Fahrendholz. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Footer Extraction Step
10
+ =============================
11
+
12
+ TODO:
13
+ what should we do with empty header/footer
14
+ """
15
+
16
+ import collections
17
+
18
+ import iamraw
19
+ import serializeraw
20
+ import utilo
21
+
22
+
23
def work(
    xhighnote: str,
    xplain: str,
    pages=None,
) -> str:
    """Combine the highnote and plain step outputs into one result.

    Both inputs are loaded with `serializeraw.load_headerfooter`, merged
    page by page by `judge_strategy`, checked for duplicated pages by
    `validate` and serialized again.

    Args:
        xhighnote: output of the `highnote` step.
        xplain: output of the `plain` step.
        pages: optional page selection, forwarded to the loader.
    Returns:
        The merged result serialized by
        `serializeraw.dump_headerfooter`.
    Raises:
        ValueError: raised by `validate` if a page occurs more than once.
    """
    # Order matters: `judge_strategy` gives the first entry priority.
    candidates = tuple(
        serializeraw.load_headerfooter(source, pages=pages)
        for source in (xhighnote, xplain)
    )
    merged = judge_strategy(candidates)
    validate(merged)
    return serializeraw.dump_headerfooter(merged)
51
+
52
+
53
def judge_strategy(
    results: list[iamraw.PageContentFooterHeaders],
) -> iamraw.PageContentFooterHeaders:
    """Merge the results of two footer strategies page by page.

    `results` must hold exactly two strategy results. They are aligned
    with `utilo.sync_pages`; the first one has priority (`work` passes
    the highnote result first and the plain result second). For every
    page the footer is chosen as:

    1. the footer of the first result, if that footer is truthy,
    2. otherwise the footer of the second result, if that is truthy,
    3. otherwise the (falsy) footer of the first result, or None if the
       first result has no entry for the page.

    Args:
        results: the two strategy results, priority first.
    Returns:
        list with one `iamraw.PageContentFooterHeader` per synced page;
        only `footer` and `page` are set.
    Raises:
        AssertionError: if `results` is None or the merged pages are not
            in ascending order.
    """
    assert results is not None, 'require list of strategy results'
    merged = []
    for pdfpage, (primary, fallback) in utilo.sync_pages(results):
        footer = primary.footer if primary else None
        # The log label stays 'moving' for the first strategy to keep the
        # verbose output unchanged.
        # NOTE(review): 'moving' is logged whenever the first strategy has
        # an entry for the page, even if that entry has no footer --
        # confirm whether this is intended.
        footer_best = 'moving' if primary else None
        # Use the second strategy only if the first one found no footer.
        if not footer and fallback and fallback.footer:
            footer = fallback.footer
            footer_best = 'plain'
        if footer_best:
            utilo.verbose(f'footer: {pdfpage} {footer_best}')
        merged.append(
            iamraw.PageContentFooterHeader(
                footer=footer,
                page=pdfpage,
            ))
    page_order = [item.page for item in merged]
    assert utilo.isascending(page_order), page_order
    return merged
103
+
104
+
105
def quality(results: list) -> tuple:
    """Determine the page coverage of every extraction strategy.

    The strategy results are aligned with `utilo.sync_pages`. For each
    strategy the number of pages with a truthy entry is divided by the
    number of distinct pages seen.

    Args:
        results: one result per strategy.
    Returns:
        tuple with one value in [0.0, 1.0] per strategy, in the order of
        `results`; every value is 0 if no page was seen.
    """
    seen_pages = set()
    # strategy index -> number of pages with a truthy entry
    hits = collections.Counter()
    for pdfpage, entries in utilo.sync_pages(results):
        seen_pages.add(pdfpage)
        hits.update(index for index, entry in enumerate(entries) if entry)
    total = len(seen_pages)
    if not total:
        return tuple(0 for _ in results)
    return tuple(hits[index] / total for index, _ in enumerate(results))
120
+
121
+
122
def validate(items: list):
    """Ensure that no `page` value occurs more than once in `items`.

    Args:
        items(list): objects that expose a `page` attribute.
    Raises:
        ValueError: if at least one page is duplicated; the message holds
            one line per duplicated page together with its count, most
            frequent page first.
    """
    # TODO: REMOVE AFTER UPGRADING IAMRAW
    occurrences = collections.Counter(item.page for item in items)
    duplicates = [
        f'duplicated page: {page} ({count})'
        for page, count in occurrences.most_common()
        if count > 1
    ]
    if duplicates:
        raise ValueError(utilo.NEWLINE.join(duplicates))