docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +35 -6
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +197 -0
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +338 -0
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/chroma.py.bak +196 -0
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +65 -0
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +127 -0
- docp/objects/_slideobject.py +110 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +61 -0
- docp/objects/pptxobject.py +46 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +236 -0
- docp/parsers/_pdftableparser.py +272 -0
- docp/parsers/_pdftextparser.py +263 -0
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pdfparser.py +62 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.0.0.dev1.dist-info/METADATA +0 -55
- docp-0.0.0.dev1.dist-info/RECORD +0 -7
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the generalised base functionality for
|
5
|
+
the document-type-specific base classes.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
|
16
|
+
class _DocBase:
|
17
|
+
"""Private document base class.
|
18
|
+
|
19
|
+
.. attention::
|
20
|
+
|
21
|
+
This class is *not* designed to be interacted with directly, but
|
22
|
+
rather to be inherited by the document-type-specific document
|
23
|
+
objects.
|
24
|
+
|
25
|
+
"""
|
26
|
+
|
27
|
+
def __init__(self):
|
28
|
+
"""Base document object class initialiser."""
|
29
|
+
self._common = None # Used by the header/footer scanner.
|
30
|
+
self._fname = None # Filename (basename)
|
31
|
+
self._fpath = None # Full file path
|
32
|
+
self._meta = None # Metadata from the document parser
|
33
|
+
self._npages = 0 # Number of pages in the document
|
34
|
+
self._ntables = 0 # Number of tables extracted
|
35
|
+
self._parser = None # Underlying document parser functionality
|
36
|
+
|
37
|
+
@property
|
38
|
+
def basename(self) -> str:
|
39
|
+
"""Accessor for the file's basename."""
|
40
|
+
return self._fname
|
41
|
+
|
42
|
+
@property
|
43
|
+
def filepath(self) -> str:
|
44
|
+
"""Accessor for the explicit path to this file."""
|
45
|
+
return self._fpath
|
46
|
+
|
47
|
+
@property
|
48
|
+
def metadata(self) -> dict | object:
|
49
|
+
"""The metadata as extracted from the document."""
|
50
|
+
return self._meta
|
51
|
+
|
52
|
+
@property
|
53
|
+
def npages(self) -> int:
|
54
|
+
"""The number of pages successfully extracted from the source."""
|
55
|
+
return self._npages
|
56
|
+
|
57
|
+
@property
|
58
|
+
def ntables(self) -> int:
|
59
|
+
"""The number of tables successfully extracted from the source."""
|
60
|
+
return self._ntables
|
61
|
+
|
62
|
+
@property
|
63
|
+
def parser(self) -> object:
|
64
|
+
"""Accessor to the underlying document parser's functionality."""
|
65
|
+
return self._parser
|
File without changes
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the implementation for the
|
5
|
+
``PageObject`` object.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
17
|
+
|
18
|
+
class PageObject:
|
19
|
+
"""This class provides the implementation for the ``PageObject``.
|
20
|
+
|
21
|
+
For each page in a document, an instance of this class is created,
|
22
|
+
populated and appended into the document's ``pages`` list attribute.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
content (str, optional): Page content as a single string.
|
26
|
+
Defaults to ''.
|
27
|
+
pageno (int, optional): Page number. Defaults to 0.
|
28
|
+
parser (object, optional): The underlying document parser object.
|
29
|
+
Defaults to None.
|
30
|
+
|
31
|
+
"""
|
32
|
+
|
33
|
+
__slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables')
|
34
|
+
|
35
|
+
def __init__(self, content: str='', pageno: int=0, parser: object=None):
|
36
|
+
"""Page object class initialiser."""
|
37
|
+
self._content = content
|
38
|
+
self._pageno = pageno
|
39
|
+
self._parser = parser
|
40
|
+
self._hastext = bool(content)
|
41
|
+
self._tables = []
|
42
|
+
|
43
|
+
def __repr__(self) -> str:
|
44
|
+
"""Formatted representation of this object."""
|
45
|
+
if self._pageno == 0:
|
46
|
+
return f'<Page: {self._pageno}; <index offset>>'
|
47
|
+
return f'<Page: {self._pageno}; Chars: {len(self._content)}>'
|
48
|
+
|
49
|
+
def __str__(self) -> str:
|
50
|
+
"""Formatted string displayed when printing this object."""
|
51
|
+
c = self._content[:25].replace('\n', ' ') + ' ...' if self._content else ''
|
52
|
+
fmt = (f'Page no: {self._pageno}; '
|
53
|
+
f'Content: "{c}"; '
|
54
|
+
f'Chars: {len(self._content)}; '
|
55
|
+
f'nTables: {len(self._tables)}; '
|
56
|
+
f'Parser avail: {bool(self._parser)}')
|
57
|
+
return fmt
|
58
|
+
|
59
|
+
@property
|
60
|
+
def content(self) -> str:
|
61
|
+
"""Accessor to the page's textual content."""
|
62
|
+
return self._content
|
63
|
+
|
64
|
+
@content.setter
|
65
|
+
def content(self, value: str) -> None:
|
66
|
+
"""Setter for the ``content`` attribute.
|
67
|
+
|
68
|
+
If the ``value`` argument is populated, the content is set and
|
69
|
+
the ``hastext`` attribute is set to ``True``.
|
70
|
+
|
71
|
+
"""
|
72
|
+
if value:
|
73
|
+
self._content = value
|
74
|
+
self._hastext = True
|
75
|
+
|
76
|
+
@property
|
77
|
+
def hastext(self) -> bool:
|
78
|
+
"""Flag indicating if the ``content`` attribute is populated."""
|
79
|
+
return self._hastext
|
80
|
+
|
81
|
+
@property
|
82
|
+
def pageno(self) -> int:
|
83
|
+
"""Accessor to the page number.
|
84
|
+
|
85
|
+
Note:
|
86
|
+
This is the page number with regard to the page's *sequence
|
87
|
+
in the overall document*. This is *not* guaranteed to be the
|
88
|
+
page's number per the document's page labeling scheme.
|
89
|
+
|
90
|
+
"""
|
91
|
+
return self._pageno
|
92
|
+
|
93
|
+
@property
|
94
|
+
def parser(self) -> object:
|
95
|
+
"""Accessor to the document parser's internal functionality.
|
96
|
+
|
97
|
+
Note:
|
98
|
+
The population of this property is determined by the
|
99
|
+
document-type-specific ``docp`` parser. If the underlying
|
100
|
+
parsing library has functionality worth preserving and making
|
101
|
+
available to the user, it is stored to this property.
|
102
|
+
Otherwise, this property will remain as ``None``.
|
103
|
+
|
104
|
+
"""
|
105
|
+
return self._parser
|
106
|
+
|
107
|
+
@property
|
108
|
+
def tables(self) -> list:
|
109
|
+
"""Accessor to the page's tables, if parsed."""
|
110
|
+
return self._tables
|
111
|
+
|
112
|
+
def show(self) -> pdfplumber.display.PageImage: # pylint: disable=undefined-variable # noqa
|
113
|
+
"""Display the page as an image.
|
114
|
+
|
115
|
+
Additionally, the return value exposes access to the underlying
|
116
|
+
``pdfplumber`` debugging visualisation methods such as:
|
117
|
+
|
118
|
+
- :func:`img.debug_tablefinder`
|
119
|
+
- :func:`img.draw_*`
|
120
|
+
- :func:`img.outline_chars`
|
121
|
+
- :func:`img.outline_words`
|
122
|
+
- :func:`img.reset`
|
123
|
+
- etc.
|
124
|
+
|
125
|
+
|
126
|
+
"""
|
127
|
+
return self.parser.to_image()
|
@@ -0,0 +1,110 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the implementation for the
|
5
|
+
``SlideObject`` object.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
|
16
|
+
class SlideObject:
|
17
|
+
r"""This class provides the implementation for the ``SlideObject``.
|
18
|
+
|
19
|
+
For each slide in a document (e.g. PowerPoint), an instance of this
|
20
|
+
class is created, populated and appended into the PPTX document's
|
21
|
+
``slides`` list attribute.
|
22
|
+
|
23
|
+
Args:
|
24
|
+
pageno (int, optional): Page number. Defaults to 0.
|
25
|
+
parser (object, optional): The underlying document parser object.
|
26
|
+
Defaults to None.
|
27
|
+
|
28
|
+
.. tip::
|
29
|
+
To display the textual contents of a slide, simply call the
|
30
|
+
following, where 42 is the slide to be displayed::
|
31
|
+
|
32
|
+
>>> print(*pptx.doc.slides[42].texts, sep='\n\n')
|
33
|
+
|
34
|
+
"""
|
35
|
+
|
36
|
+
__slots__ = ('_imgs', '_tables', '_texts', '_pageno', '_parser')
|
37
|
+
|
38
|
+
def __init__(self, pageno: int=0, parser: object=None):
|
39
|
+
"""Slide object class initialiser."""
|
40
|
+
self._imgs = []
|
41
|
+
self._tables = []
|
42
|
+
self._texts = []
|
43
|
+
self._pageno = pageno
|
44
|
+
self._parser = parser
|
45
|
+
|
46
|
+
def __repr__(self) -> str:
|
47
|
+
"""Formatted representation of this object."""
|
48
|
+
return f'<Slide: {self._pageno}>'
|
49
|
+
|
50
|
+
def __str__(self) -> str:
|
51
|
+
"""Formatted representation of this object, when printed."""
|
52
|
+
if self._pageno == 0:
|
53
|
+
return f'<Slide: {self._pageno}; <index offset>>'
|
54
|
+
return (f'<Slide: {self._pageno}; '
|
55
|
+
f'Text blocks: {len(self._texts)}; '
|
56
|
+
f'Tables: {len(self._tables)}; '
|
57
|
+
f'Images: {len(self._imgs)}; '
|
58
|
+
f'Parser: {bool(self._parser)}>')
|
59
|
+
|
60
|
+
@property
|
61
|
+
def content(self) -> str:
|
62
|
+
"""Accessor to the textual content of a slide.
|
63
|
+
|
64
|
+
Returns:
|
65
|
+
str: A concatenated string for all text objects found on the
|
66
|
+
slide; each object separated by a double-newline.
|
67
|
+
|
68
|
+
"""
|
69
|
+
return '\n\n'.join(i.content for i in self._texts)
|
70
|
+
|
71
|
+
@property
|
72
|
+
def images(self) -> list:
|
73
|
+
"""Accessor to a slide's image objects."""
|
74
|
+
return self._imgs
|
75
|
+
|
76
|
+
@property
|
77
|
+
def pageno(self) -> int:
|
78
|
+
"""Accessor to the page number.
|
79
|
+
|
80
|
+
Note:
|
81
|
+
This is the page number with regard to the page's *sequence
|
82
|
+
in the overall document*. This is *not* guaranteed to be the
|
83
|
+
page's number per the document's page labeling scheme.
|
84
|
+
|
85
|
+
"""
|
86
|
+
return self._pageno
|
87
|
+
|
88
|
+
@property
|
89
|
+
def parser(self) -> object:
|
90
|
+
"""Accessor to the document parser's internal functionality.
|
91
|
+
|
92
|
+
Note:
|
93
|
+
The population of this property is determined by the
|
94
|
+
document-type-specific ``docp`` parser. If the underlying
|
95
|
+
parsing library has functionality worth preserving and making
|
96
|
+
available to the user, it is stored to this property.
|
97
|
+
Otherwise, this property will remain as ``None``.
|
98
|
+
|
99
|
+
"""
|
100
|
+
return self._parser
|
101
|
+
|
102
|
+
@property
|
103
|
+
def tables(self) -> list:
|
104
|
+
"""Accessor to a slide's table objects."""
|
105
|
+
return self._tables
|
106
|
+
|
107
|
+
@property
|
108
|
+
def texts(self) -> list:
|
109
|
+
"""Accessor to a slide's text objects."""
|
110
|
+
return self._texts
|
File without changes
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the implementation for the
|
5
|
+
``TextObject`` object.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
|
16
|
+
class TextObject:
|
17
|
+
"""This class provides the implementation for the ``TextObject``.
|
18
|
+
|
19
|
+
For each page (or slide) in a document, an instance of this class is
|
20
|
+
created, populated and appended into the page's ``texts`` list
|
21
|
+
attribute.
|
22
|
+
|
23
|
+
Args:
|
24
|
+
content (str): Page content as a single string.
|
25
|
+
|
26
|
+
Note:
|
27
|
+
No string cleaning is performed by this class. The string
|
28
|
+
contained in the :attr:`content` attribute is stored exactly as
|
29
|
+
extracted from the page or slide's text object.
|
30
|
+
|
31
|
+
"""
|
32
|
+
|
33
|
+
__slots__ = ('_content', '_hastext')
|
34
|
+
|
35
|
+
def __init__(self, content: str):
|
36
|
+
"""Text object class initialiser."""
|
37
|
+
self._content = content
|
38
|
+
self._hastext = bool(content)
|
39
|
+
|
40
|
+
def __str__(self) -> str:
|
41
|
+
"""When printing this object, display the text contents."""
|
42
|
+
return self._content
|
43
|
+
|
44
|
+
@property
|
45
|
+
def content(self) -> str:
|
46
|
+
"""Accessor to the textual content."""
|
47
|
+
return self._content
|
48
|
+
|
49
|
+
@content.setter
|
50
|
+
def content(self, value: str) -> None:
|
51
|
+
"""Setter for the ``content`` attribute.
|
52
|
+
|
53
|
+
If the ``value`` argument is populated, the content is set and
|
54
|
+
the ``hastext`` attribute is set to ``True``.
|
55
|
+
|
56
|
+
"""
|
57
|
+
if value:
|
58
|
+
self._content = value
|
59
|
+
self._hastext = True
|
60
|
+
|
61
|
+
@property
|
62
|
+
def hastext(self) -> bool:
|
63
|
+
"""Flag indicating if the ``content`` attribute is populated."""
|
64
|
+
return self._hastext
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the 'PDF Document' object structure into
|
5
|
+
which PDF documents are parsed for transport and onward
|
6
|
+
use.
|
7
|
+
|
8
|
+
:Platform: Linux/Windows | Python 3.10+
|
9
|
+
:Developer: J Berendt
|
10
|
+
:Email: development@s3dev.uk
|
11
|
+
|
12
|
+
:Comments: n/a
|
13
|
+
|
14
|
+
"""
|
15
|
+
|
16
|
+
try:
|
17
|
+
from .objects._docbaseobject import _DocBase
|
18
|
+
from .objects._pageobject import PageObject
|
19
|
+
except ImportError:
|
20
|
+
from objects._docbaseobject import _DocBase
|
21
|
+
from objects._pageobject import PageObject
|
22
|
+
|
23
|
+
|
24
|
+
class DocPDF(_DocBase):
|
25
|
+
"""Container class for storing data parsed from a PDF file."""
|
26
|
+
|
27
|
+
def __init__(self):
|
28
|
+
"""PDF document object class initialiser."""
|
29
|
+
super().__init__()
|
30
|
+
self._tags = False
|
31
|
+
# List of PageObjects, offset by 1 to align the index with page numbers.
|
32
|
+
self._pages = [PageObject(pageno=0)]
|
33
|
+
|
34
|
+
@property
|
35
|
+
def pages(self) -> list[PageObject]:
|
36
|
+
"""A list containing an object for each page in the document.
|
37
|
+
|
38
|
+
.. tip::
|
39
|
+
|
40
|
+
The page number index aligns to the page number in the PDF
|
41
|
+
file.
|
42
|
+
|
43
|
+
For example, to access the ``PageObject`` for page 42, use::
|
44
|
+
|
45
|
+
pages[42]
|
46
|
+
|
47
|
+
"""
|
48
|
+
return self._pages
|
49
|
+
|
50
|
+
@property
|
51
|
+
def parsed_using_tags(self) -> bool:
|
52
|
+
"""Flag indicating if the document was parsed using tags.
|
53
|
+
|
54
|
+
PDF documents can be created with 'marked content' tags. When
|
55
|
+
a PDF document is parsed using tags, as this flag indicates, the
|
56
|
+
parser respects columns and other page formatting schemes. If a
|
57
|
+
multi-column page is parsed without tags, the parser reads
|
58
|
+
straight across the line, thus corrupting the text.
|
59
|
+
|
60
|
+
"""
|
61
|
+
return self._tags
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the 'PPTX Document' object structure
|
5
|
+
into which MS PowerPoint documents are parsed for
|
6
|
+
transport and onward use.
|
7
|
+
|
8
|
+
:Platform: Linux/Windows | Python 3.10+
|
9
|
+
:Developer: J Berendt
|
10
|
+
:Email: development@s3dev.uk
|
11
|
+
|
12
|
+
:Comments: n/a
|
13
|
+
|
14
|
+
"""
|
15
|
+
|
16
|
+
try:
|
17
|
+
from .objects._docbaseobject import _DocBase
|
18
|
+
from .objects._slideobject import SlideObject
|
19
|
+
except ImportError:
|
20
|
+
from objects._docbaseobject import _DocBase
|
21
|
+
from objects._slideobject import SlideObject
|
22
|
+
|
23
|
+
|
24
|
+
class DocPPTX(_DocBase):
|
25
|
+
"""Container class for storing data parsed from a PPTX file."""
|
26
|
+
|
27
|
+
def __init__(self):
|
28
|
+
"""PPTX document object class initialiser."""
|
29
|
+
super().__init__()
|
30
|
+
self._slides = [SlideObject(pageno=0)]
|
31
|
+
|
32
|
+
@property
|
33
|
+
def slides(self) -> list[SlideObject]:
|
34
|
+
"""A list containing an object for each slide in the document.
|
35
|
+
|
36
|
+
.. tip::
|
37
|
+
|
38
|
+
The slide number index aligns to the slide number in the
|
39
|
+
PPTX file.
|
40
|
+
|
41
|
+
For example, to access the ``SlideObject`` for slide 42, use::
|
42
|
+
|
43
|
+
slides[42]
|
44
|
+
|
45
|
+
"""
|
46
|
+
return self._slides
|
docp/parsers/__init__.py
ADDED
File without changes
|