docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +35 -6
  13. docp/dbs/__init__.py +0 -0
  14. docp/dbs/chroma.py +197 -0
  15. docp/libs/_version.py +1 -0
  16. docp/libs/changelog.py +7 -0
  17. docp/libs/utilities.py +107 -0
  18. docp/loaders/__init__.py +38 -0
  19. docp/loaders/_chromabaseloader.py +338 -0
  20. docp/loaders/_chromabaseloader.py.bak +378 -0
  21. docp/loaders/_chromabasepdfloader.py +121 -0
  22. docp/loaders/_chromabasepptxloader.py +123 -0
  23. docp/loaders/chroma.py.bak +196 -0
  24. docp/loaders/chromapdfloader.py +199 -0
  25. docp/loaders/chromapptxloader.py +192 -0
  26. docp/loaders/lutilities.py +52 -0
  27. docp/objects/__init__.py +0 -0
  28. docp/objects/_docbaseobject.py +65 -0
  29. docp/objects/_imgobject.py +0 -0
  30. docp/objects/_pageobject.py +127 -0
  31. docp/objects/_slideobject.py +110 -0
  32. docp/objects/_tableobject.py +0 -0
  33. docp/objects/_textobject.py +64 -0
  34. docp/objects/pdfobject.py +61 -0
  35. docp/objects/pptxobject.py +46 -0
  36. docp/parsers/__init__.py +0 -0
  37. docp/parsers/_pdfbaseparser.py +236 -0
  38. docp/parsers/_pdftableparser.py +272 -0
  39. docp/parsers/_pdftextparser.py +263 -0
  40. docp/parsers/_pptxbaseparser.py +93 -0
  41. docp/parsers/_pptxtextparser.py +115 -0
  42. docp/parsers/pdfparser.py +62 -0
  43. docp/parsers/pptxparser.py +51 -0
  44. docp/parsers/putilities.py +48 -0
  45. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
  46. docp-0.2.0.dist-info/METADATA +110 -0
  47. docp-0.2.0.dist-info/RECORD +49 -0
  48. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  49. docp/_version.py +0 -1
  50. docp-0.0.0.dev1.dist-info/METADATA +0 -55
  51. docp-0.0.0.dev1.dist-info/RECORD +0 -7
  52. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the generalised base functionality for
5
+ the document-type-specific base classes.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+
16
+ class _DocBase:
17
+ """Private document base class.
18
+
19
+ .. attention::
20
+
21
+ This class is *not* designed to be interacted with directly, but
22
+ rather to be inherited by the document-type-specific document
23
+ objects.
24
+
25
+ """
26
+
27
+ def __init__(self):
28
+ """Base document object class initialiser."""
29
+ self._common = None # Used by the header/footer scanner.
30
+ self._fname = None # Filename (basename)
31
+ self._fpath = None # Full file path
32
+ self._meta = None # Metadata from the document parger
33
+ self._npages = 0 # Number of pages in the document
34
+ self._ntables = 0 # Number of tables extracted
35
+ self._parser = None # Underlying document parser functionality
36
+
37
+ @property
38
+ def basename(self) -> str:
39
+ """Accessor for the file's basename."""
40
+ return self._fname
41
+
42
+ @property
43
+ def filepath(self) -> str:
44
+ """Accessor for the explicit path to this file."""
45
+ return self._fpath
46
+
47
+ @property
48
+ def metadata(self) -> dict | object:
49
+ """The meta data as extracted from the document."""
50
+ return self._meta
51
+
52
+ @property
53
+ def npages(self) -> int:
54
+ """The number of pages successfully extracted from the source."""
55
+ return self._npages
56
+
57
+ @property
58
+ def ntables(self) -> int:
59
+ """The number of tables successfully extracted from the source."""
60
+ return self._ntables
61
+
62
+ @property
63
+ def parser(self) -> object:
64
+ """Accessor to the underlying document parser's functionality."""
65
+ return self._parser
File without changes
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ ``PageObject`` object.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+
18
class PageObject:
    """This class provides the implementation for the ``PageObject``.

    For each page in a document, an instance of this class is created,
    populated and appended into the document's ``pages`` list attribute.

    Args:
        content (str, optional): Page content as a single string.
            Defaults to ''.
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    """

    __slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables')

    def __init__(self, content: str='', pageno: int=0, parser: object=None):
        """Page object class initialiser."""
        self._content = content
        self._pageno = pageno
        self._parser = parser
        # The flag is derived from the initial content's truthiness.
        self._hastext = bool(content)
        self._tables = []

    def __repr__(self) -> str:
        """Formatted representation of this object.

        Page number 0 is reported as the index-offset placeholder rather
        than as a page with a character count.

        """
        if self._pageno == 0:
            return f'<Page: {self._pageno}; <index offset>>'
        return f'<Page: {self._pageno}; Chars: {len(self._content)}>'

    def __str__(self) -> str:
        """Formatted string displayed when printing this object.

        The content preview is the first 25 characters, with newlines
        replaced by spaces and a trailing ' ...' marker. An empty page
        displays an empty preview.

        """
        c = self._content[:25].replace('\n', ' ') + ' ...' if self._content else ''
        fmt = (f'Page no: {self._pageno}; '
               f'Content: "{c}"; '
               f'Chars: {len(self._content)}; '
               f'nTables: {len(self._tables)}; '
               f'Parser avail: {bool(self._parser)}')
        return fmt

    @property
    def content(self) -> str:
        """Accessor to the page's textual content."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Setter for the ``content`` attribute.

        If the ``value`` argument is populated, the content is set and
        the ``hastext`` attribute is set to ``True``. An empty (falsy)
        ``value`` is ignored, leaving the current content in place.

        """
        if value:
            self._content = value
            self._hastext = True

    @property
    def hastext(self) -> bool:
        """Flag indicating if the ``content`` attribute is populated."""
        return self._hastext

    @property
    def pageno(self) -> int:
        """Accessor to the page number.

        Note:
            This is the page number with regard to the page's *sequence
            in the overall document*. This is *not* guaranteed to be the
            page's number per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """Accessor to the document parser's internal functionality.

        Note:
            The population of this property is determined by the
            document-type-specific ``docp`` parser. If the underlying
            parsing library has functionality worth preserving and making
            available to the user, it is stored to this property.
            Otherwise, this property will remain as ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """Accessor to the page's tables, if parsed."""
        return self._tables

    def show(self) -> pdfplumber.display.PageImage:  # pylint: disable=undefined-variable  # noqa
        """Display the page as an image.

        Additionally, the return value exposes access to the underlying
        ``pdfplumber`` debugging visualisation methods such as:

            - :func:`img.debug_tablefinder`
            - :func:`img.draw_*`
            - :func:`img.outline_chars`
            - :func:`img.outline_words`
            - :func:`img.reset`
            - etc.

        Raises:
            AttributeError: If no parser object is stored on this page,
                as there is nothing from which to render the image.

        Returns:
            The object returned by the stored parser's ``to_image``
            method.

        """
        # The parser is optional (it defaults to None), so fail with an
        # explicit message rather than an opaque NoneType attribute error.
        if self._parser is None:
            raise AttributeError('No parser is available for this page; '
                                 'the page cannot be displayed as an image.')
        return self._parser.to_image()
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ ``SlideObject`` object.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+
16
class SlideObject:
    r"""Container for the objects parsed from a single slide.

    One instance of this class is created for each slide in a document
    (e.g. PowerPoint), populated, and appended into the PPTX document's
    ``slides`` list attribute.

    Args:
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    .. tip::
        To display the textual contents of a slide, simply call the
        following, where 42 is the slide to be displayed::

            >>> print(*pptx.doc.slides[42].texts, sep='\n\n')

    """

    __slots__ = ('_imgs', '_tables', '_texts', '_pageno', '_parser')

    def __init__(self, pageno: int=0, parser: object=None):
        """Store the slide number and parser; start with empty containers."""
        self._pageno = pageno
        self._parser = parser
        # Each container is its own, initially empty, list.
        self._imgs, self._tables, self._texts = [], [], []

    def __repr__(self) -> str:
        """Short representation showing only the slide number."""
        return f'<Slide: {self._pageno}>'

    def __str__(self) -> str:
        """Summary of the slide's contents, displayed when printed.

        Slide number 0 is reported as the index-offset placeholder.

        """
        if self._pageno != 0:
            counts = (f'Text blocks: {len(self._texts)}; '
                      f'Tables: {len(self._tables)}; '
                      f'Images: {len(self._imgs)}; '
                      f'Parser: {bool(self._parser)}')
            return f'<Slide: {self._pageno}; {counts}>'
        return f'<Slide: {self._pageno}; <index offset>>'

    @property
    def content(self) -> str:
        """The textual content of the slide.

        Returns:
            str: The ``content`` of every text object on the slide,
            concatenated into one string with a double-newline between
            each object.

        """
        blocks = [txt.content for txt in self._texts]
        return '\n\n'.join(blocks)

    @property
    def images(self) -> list:
        """The slide's image objects."""
        return self._imgs

    @property
    def pageno(self) -> int:
        """The page number.

        Note:
            This number reflects the page's *sequence in the overall
            document*. It is *not* guaranteed to match the page's number
            per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """The document parser's internal functionality.

        Note:
            Whether this property is populated is determined by the
            document-type-specific ``docp`` parser. If the underlying
            parsing library has functionality worth preserving and making
            available to the user, it is stored to this property.
            Otherwise, this property remains as ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """The slide's table objects."""
        return self._tables

    @property
    def texts(self) -> list:
        """The slide's text objects."""
        return self._texts
File without changes
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ ``TextObject`` object.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+
16
class TextObject:
    """This class provides the implementation for the ``TextObject``.

    An instance of this class is created, populated and appended into
    the page's (or slide's) ``texts`` list attribute.

    Args:
        content (str): Text content as a single string.

    Note:
        No string cleaning is performed by this class. The string
        contained in the :attr:`content` attribute is stored exactly as
        extracted from the page or slide's text object.

    """

    __slots__ = ('_content', '_hastext')

    def __init__(self, content: str):
        """Text object class initialiser."""
        self._content = content
        # The flag is derived from the initial content's truthiness.
        self._hastext = bool(content)

    def __repr__(self) -> str:
        """Formatted representation of this object.

        Keeps containers of text objects readable, rather than showing
        the default object representation.

        """
        return f'<Text: Chars: {len(self._content or "")}>'

    def __str__(self) -> str:
        """When printing this object, display the text contents.

        Empty content (including ``None``) is displayed as an empty
        string, as ``__str__`` must always return a string.

        """
        return self._content or ''

    @property
    def content(self) -> str:
        """Accessor to the textual content."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Setter for the ``content`` attribute.

        If the ``value`` argument is populated, the content is set and
        the ``hastext`` attribute is set to ``True``. An empty (falsy)
        ``value`` is ignored, leaving the current content in place.

        """
        if value:
            self._content = value
            self._hastext = True

    @property
    def hastext(self) -> bool:
        """Flag indicating if the ``content`` attribute is populated."""
        return self._hastext
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the 'PDF Document' object structure into
5
+ which PDF documents are parsed for transport and onward
6
+ use.
7
+
8
+ :Platform: Linux/Windows | Python 3.10+
9
+ :Developer: J Berendt
10
+ :Email: development@s3dev.uk
11
+
12
+ :Comments: n/a
13
+
14
+ """
15
+
16
+ try:
17
+ from .objects._docbaseobject import _DocBase
18
+ from .objects._pageobject import PageObject
19
+ except ImportError:
20
+ from objects._docbaseobject import _DocBase
21
+ from objects._pageobject import PageObject
22
+
23
+
24
class DocPDF(_DocBase):
    """Container object holding the data parsed from a PDF file."""

    def __init__(self):
        """Initialise the base attributes, then the PDF-specific ones."""
        super().__init__()
        # Index 0 holds a placeholder PageObject, offsetting the list by 1
        # so that each index aligns with its page number.
        self._pages = [PageObject(pageno=0)]
        # Exposed through the ``parsed_using_tags`` property.
        self._tags = False

    @property
    def pages(self) -> list[PageObject]:
        """A list containing an object for each page in the document.

        .. tip::

            The list index aligns to the page number in the PDF file.

            For example, to access the ``PageObject`` for page 42, use::

                pages[42]

        """
        return self._pages

    @property
    def parsed_using_tags(self) -> bool:
        """Flag indicating if the document was parsed using tags.

        PDF documents can be created with 'marked content' tags. When
        this flag indicates the document was parsed using tags, the
        parser respects columns and other page formatting schemes. If a
        multi-column page is parsed without tags, the parser reads
        straight across the line, thus corrupting the text.

        """
        return self._tags
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the 'PPTX Document' object structure
5
+ into which MS PowerPoint documents are parsed for
6
+ transport and onward use.
7
+
8
+ :Platform: Linux/Windows | Python 3.10+
9
+ :Developer: J Berendt
10
+ :Email: development@s3dev.uk
11
+
12
+ :Comments: n/a
13
+
14
+ """
15
+
16
+ try:
17
+ from .objects._docbaseobject import _DocBase
18
+ from .objects._slideobject import SlideObject
19
+ except ImportError:
20
+ from objects._docbaseobject import _DocBase
21
+ from objects._slideobject import SlideObject
22
+
23
+
24
class DocPPTX(_DocBase):
    """Container object holding the data parsed from a PPTX file."""

    def __init__(self):
        """Initialise the base attributes, then the slides list."""
        super().__init__()
        # Index 0 holds a placeholder SlideObject, so that each list index
        # aligns with its slide number.
        placeholder = SlideObject(pageno=0)
        self._slides = [placeholder]

    @property
    def slides(self) -> list[SlideObject]:
        """A list containing an object for each slide in the document.

        .. tip::

            The list index aligns to the slide number in the PPTX file.

            For example, to access the ``SlideObject`` for slide 42,
            use::

                slides[42]

        """
        return self._slides
File without changes