docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +19 -10
  13. docp/dbs/chroma.py +19 -6
  14. docp/libs/_version.py +1 -0
  15. docp/libs/changelog.py +7 -0
  16. docp/libs/utilities.py +107 -0
  17. docp/loaders/__init__.py +38 -0
  18. docp/loaders/_chromabaseloader.py +83 -107
  19. docp/loaders/_chromabaseloader.py.bak +378 -0
  20. docp/loaders/_chromabasepdfloader.py +121 -0
  21. docp/loaders/_chromabasepptxloader.py +123 -0
  22. docp/loaders/{chroma.py → chroma.py.bak} +38 -8
  23. docp/loaders/chromapdfloader.py +199 -0
  24. docp/loaders/chromapptxloader.py +192 -0
  25. docp/loaders/lutilities.py +52 -0
  26. docp/objects/_docbaseobject.py +7 -18
  27. docp/objects/_imgobject.py +0 -0
  28. docp/objects/_pageobject.py +3 -2
  29. docp/objects/_slideobject.py +110 -0
  30. docp/objects/_textobject.py +64 -0
  31. docp/objects/pdfobject.py +24 -2
  32. docp/objects/pptxobject.py +46 -0
  33. docp/parsers/_pdfbaseparser.py +36 -10
  34. docp/parsers/_pdftableparser.py +6 -7
  35. docp/parsers/_pdftextparser.py +23 -13
  36. docp/parsers/_pptxbaseparser.py +93 -0
  37. docp/parsers/_pptxtextparser.py +115 -0
  38. docp/parsers/pptxparser.py +51 -0
  39. docp/parsers/putilities.py +48 -0
  40. docp-0.2.0.dist-info/METADATA +110 -0
  41. docp-0.2.0.dist-info/RECORD +49 -0
  42. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  43. docp/_version.py +0 -1
  44. docp-0.1.0b1.dist-info/METADATA +0 -55
  45. docp-0.1.0b1.dist-info/RECORD +0 -23
  46. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/LICENSE +0 -0
  47. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading PDF files
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Examples:
14
+
15
+ Parse and load a *single* PDF file into a Chroma database
16
+ collection::
17
+
18
+ >>> from docp.loaders import ChromaPDFLoader
19
+
20
+ >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
21
+ collection='spam')
22
+ >>> l.load(path='/path/to/directory/myfile.pdf')
23
+
24
+
25
+ Parse and load a *directory* of PDF files into a Chroma database
26
+ collection::
27
+
28
+ >>> from docp.loaders import ChromaPDFLoader
29
+
30
+ >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
31
+ collection='spam')
32
+ >>> l.load(path='/path/to/directory', ext='pdf')
33
+
34
+
35
+ For further example code use, please refer to the
36
+ :class:`ChromaPDFLoader` class docstring.
37
+
38
+ """
39
+
40
+ import os
41
+ # locals
42
+ try:
43
+ from .libs.utilities import utilities
44
+ from .loaders._chromabasepdfloader import _ChromaBasePDFLoader
45
+ except ImportError:
46
+ from libs.utilities import utilities
47
+ from loaders._chromabasepdfloader import _ChromaBasePDFLoader
48
+
49
+
50
class ChromaPDFLoader(_ChromaBasePDFLoader):
    """PDF-specific document loader for a Chroma database.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Use an LLM to derive keywords
            from the document and load these keywords into the sister
            keywords collection. Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Examples:

        Parse and load a *single* PDF file into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory/myfile.pdf')


        Parse and load a *directory* of PDF files into a Chroma
        database collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory', ext='pdf')

    """

    #
    # An __init__ method is deliberately omitted so the signature of the
    # ultimate base class is the one exposed, which also avoids relaying
    # a long list of arguments up the inheritance chain.
    #

    def load(self,
             path: str,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True,
             **unused) -> None:
        """Load a PDF file (or a directory of files) into a Chroma database.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. If this is a directory, the files to
                be loaded can be narrowed to a specific file extension
                using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pdf'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to None here; the parser is expected to fall
                back to ``{'Artifact'}`` (generally header and/or footer
                content) - to be confirmed against the parser. To
                include all tags (not skip any), pass this argument as
                ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            unused (dict): Enables arbitrary keywords to be passed into
                a loader-agnostic ``.load()`` function without raising
                an 'unexpected keyword argument' ``TypeError``.

        """
        # pylint: disable=unused-argument  # They are 'used' via locals().
        # The locals() snapshot must be taken before any other local name
        # is bound, so only the method's own arguments are captured.
        parser_kwargs = self._set_kwargs(locals_=locals())
        # Single file: load it and finish.
        if not os.path.isdir(path):
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **parser_kwargs)
            return
        # Directory: collect the matching files and load each in turn.
        found = utilities.collect_files(path=path, ext=ext, recursive=recursive)
        total = len(found)
        for position, filepath in enumerate(found, start=1):
            print(f'\nProcessing {position} of {total}: {os.path.basename(filepath)}')
            self._load(path=filepath, **parser_kwargs)

    @staticmethod
    def _set_kwargs(locals_: dict) -> dict:
        r"""Build the keyword arguments which are sent to the doc parser.

        The :meth:`load` method captures its own arguments through a
        :func:`locals` call. The ``self`` and ``path`` entries must be
        stripped before the remainder is forwarded.

        Args:
            locals\_ (dict): The return value from a :func:`locals` call.

        Raises:
            KeyError: If either ``'self'`` or ``'path'`` is not a key of
                the provided dictionary.

        Returns:
            dict: A *copy* of the provided dictionary with the ``self``
            and ``path`` key/value pairs removed.

        """
        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
        trimmed = locals_.copy()
        del trimmed['self']
        del trimmed['path']
        return trimmed
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading PPTX files
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Examples:
14
+
15
+ Parse and load a *single* PPTX file into a Chroma database
16
+ collection::
17
+
18
+ >>> from docp.loaders import ChromaPPTXLoader
19
+
20
+ >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
21
+ collection='spam',
22
+ split_text=False)
23
+ >>> l.load(path='/path/to/directory/myfile.pptx')
24
+
25
+
26
+ Parse and load a *directory* of PPTX files into a Chroma database
27
+ collection::
28
+
29
+ >>> from docp.loaders import ChromaPPTXLoader
30
+
31
+ >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
32
+ collection='spam',
33
+ split_text=False)
34
+ >>> l.load(path='/path/to/directory', ext='pptx')
35
+
36
+
37
+ For further example code use, please refer to the
38
+ :class:`ChromaPPTXLoader` class docstring.
39
+
40
+ """
41
+
42
+ import os
43
+ # locals
44
+ try:
45
+ from .libs.utilities import utilities
46
+ from .loaders._chromabasepptxloader import _ChromaBasePPTXLoader
47
+ except ImportError:
48
+ from libs.utilities import utilities
49
+ from loaders._chromabasepptxloader import _ChromaBasePPTXLoader
50
+
51
+
52
class ChromaPPTXLoader(_ChromaBasePPTXLoader):
    """PPTX-specific document loader for a Chroma database.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Derive keywords from the document
            and load these into the sister keywords collection.
            Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    .. tip::

        It is recommended to pass ``split_text=False`` into the
        :class:`ChromaPPTXLoader` constructor.

        Often, PowerPoint presentations are structured such that related
        text is found in the same 'shape' (textbox) on a slide.
        Splitting the text in these shapes may have undesired results.

    :Examples:

        Parse and load a *single* PPTX file into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPPTXLoader

            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
                                     collection='spam',
                                     split_text=False)  # <-- Note this
            >>> l.load(path='/path/to/directory/myfile.pptx')


        Parse and load a *directory* of PPTX files into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPPTXLoader

            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
                                     collection='spam',
                                     split_text=False)  # <-- Note this
            >>> l.load(path='/path/to/directory', ext='pptx')

    """

    def load(self,
             path: str,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_newlines: bool=True,
             convert_to_ascii: bool=True,
             **unused) -> None:
        """Load a PPTX file (or a directory of files) into a Chroma database.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. If this is a directory, the files to
                be loaded can be narrowed to a specific file extension
                using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pptx'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pptx'`` is passed, the
                characters ``'pptx'`` are parsed and prefixed with ``*.``
                to create ``'*.pptx'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            unused (dict): Enables keywords such as ``remove_header``
                and ``remove_footer`` (for example) to be passed into a
                loader-agnostic ``.load()`` function without raising
                an 'unexpected keyword argument' ``TypeError``.

        """
        # pylint: disable=unused-argument  # They are 'used' via locals().
        # The locals() snapshot must be taken before any other local name
        # is bound, so only the method's own arguments are captured.
        parser_kwargs = self._set_kwargs(locals_=locals())
        # Single file: load it and finish.
        if not os.path.isdir(path):
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **parser_kwargs)
            return
        # Directory: collect the matching files and load each in turn.
        found = utilities.collect_files(path=path, ext=ext, recursive=recursive)
        total = len(found)
        for position, filepath in enumerate(found, start=1):
            print(f'\nProcessing {position} of {total}: {os.path.basename(filepath)}')
            self._load(path=filepath, **parser_kwargs)

    @staticmethod
    def _set_kwargs(locals_: dict) -> dict:
        r"""Build the keyword arguments which are sent to the doc parser.

        The :meth:`load` method captures its own arguments through a
        :func:`locals` call. The ``self`` and ``path`` entries must be
        stripped before the remainder is forwarded.

        Args:
            locals\_ (dict): The return value from a :func:`locals` call.

        Raises:
            KeyError: If either ``'self'`` or ``'path'`` is not a key of
                the provided dictionary.

        Returns:
            dict: A *copy* of the provided dictionary with the ``self``
            and ``path`` key/value pairs removed.

        """
        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
        trimmed = locals_.copy()
        del trimmed['self']
        del trimmed['path']
        return trimmed
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides loader-specific utility functions for
5
+ the project.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: This module is here (in the ``docp/loaders`` directory)
12
+ rather than merged with the ``docp/parsers/putilities.py``
13
+ module as the loaders' dependencies are *heavy*. Keeping the
14
+ loader functionality separate helps to ease the dependency
15
+ requirements for parser-only projects.
16
+
17
+ """
18
+
19
+ # locals
20
+ try:
21
+ from .libs.utilities import utilities
22
+ from .loaders.chromapdfloader import ChromaPDFLoader
23
+ from .loaders.chromapptxloader import ChromaPPTXLoader
24
+ except ImportError:
25
+ from libs.utilities import utilities
26
+ from loaders.chromapdfloader import ChromaPDFLoader
27
+ from loaders.chromapptxloader import ChromaPPTXLoader
28
+
29
+
30
class LoaderUtilities:
    """Loader-based (cross-project) utility functions."""

    def get_loader(self, path: str) -> ChromaPDFLoader | ChromaPPTXLoader:
        """Return the appropriate loader class for the file type.

        The file is tested with ``utilities.ispdf`` first, then with
        ``utilities.iszip``; the first test to pass selects the loader.

        Args:
            path (str): Full path to the file to be tested.

        Raises:
            NotImplementedError: If the file passes neither the PDF nor
                the ZIP test. The message names the offending file.

        Returns:
            ChromaPDFLoader | ChromaPPTXLoader: The appropriate loader
            *class* (not an instance) for the file, given the *file
            signature*; this test is not file extension based.

        """
        if utilities.ispdf(path=path):
            return ChromaPDFLoader
        # Any file which passes the ZIP test is handed to the PPTX loader.
        if utilities.iszip(path=path):
            return ChromaPPTXLoader
        # Imported locally as this module does not import os at the top
        # level, and it is only needed to build the error message.
        import os  # pylint: disable=import-outside-toplevel
        raise NotImplementedError(
            f'A loader is not available for: {os.path.basename(path)}')


# Module-level instance, so callers can import ``lutilities`` directly.
lutilities = LoaderUtilities()
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env python3
2
2
  # -*- coding: utf-8 -*-
3
3
  """
4
- :Purpose: This module provides the implementation for the
5
- document-type-specific base class.
4
+ :Purpose: This module provides the generalised base functionality for
5
+ the document-type-specific base classes.
6
6
 
7
7
  :Platform: Linux/Windows | Python 3.10+
8
8
  :Developer: J Berendt
@@ -12,19 +12,15 @@
12
12
 
13
13
  """
14
14
 
15
- from __future__ import annotations
16
- try:
17
- from .objects._pageobject import PageObject
18
- except ImportError:
19
- from objects._pageobject import PageObject
20
-
21
15
 
22
16
  class _DocBase:
23
17
  """Private document base class.
24
18
 
25
- This class is *not* designed to be interacted with directly, but
26
- rather to be inherited by the document-type-specific document
27
- objects.
19
+ .. attention::
20
+
21
+ This class is *not* designed to be interacted with directly, but
22
+ rather to be inherited by the document-type-specific document
23
+ objects.
28
24
 
29
25
  """
30
26
 
@@ -37,8 +33,6 @@ class _DocBase:
37
33
  self._npages = 0 # Number of pages in the document
38
34
  self._ntables = 0 # Number of tables extracted
39
35
  self._parser = None # Underlying document parser functionality
40
- # List of PageObjects, offset by 1 to align the index with page numbers.
41
- self._pages = [PageObject(pageno=0)]
42
36
 
43
37
  @property
44
38
  def basename(self) -> str:
@@ -65,11 +59,6 @@ class _DocBase:
65
59
  """The number of tables successfully extracted from the source."""
66
60
  return self._ntables
67
61
 
68
- @property
69
- def pages(self) -> list[PageObject]: # noqa pylint: disable=undefined-variable
70
- """A list of containing an object for each page in the document."""
71
- return self._pages
72
-
73
62
  @property
74
63
  def parser(self) -> object:
75
64
  """Accessor to the underlying document parser's functionality."""
File without changes
@@ -1,7 +1,8 @@
1
1
  #!/usr/bin/env python3
2
2
  # -*- coding: utf-8 -*-
3
3
  """
4
- :Purpose: This module provides the ``page`` object implementation.
4
+ :Purpose: This module provides the implementation for the
5
+ ``PageObject`` object.
5
6
 
6
7
  :Platform: Linux/Windows | Python 3.10+
7
8
  :Developer: J Berendt
@@ -82,7 +83,7 @@ class PageObject:
82
83
  """Accessor to the page number.
83
84
 
84
85
  Note:
85
- This is the page number 1-n, concerning the page's *sequence
86
+ This is the page number with regard to the page's *sequence
86
87
  in the overall document*. This is *not* guaranteed to be the
87
88
  page's number per the document's page labeling scheme.
88
89
 
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ ``SlideObject`` object.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+
16
class SlideObject:
    r"""Container for the content extracted from a single slide.

    For each slide in a document (e.g. PowerPoint), an instance of this
    class is created, populated and appended into the PPTX document's
    ``slides`` list attribute.

    Args:
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    .. tip::
        To display the textual contents of a slide, simply call the
        following, where 42 is the slide to be displayed::

            >>> print(*pptx.doc.slides[42].texts, sep='\n\n')

    """

    __slots__ = ('_imgs', '_tables', '_texts', '_pageno', '_parser')

    def __init__(self, pageno: int=0, parser: object=None):
        """Slide object class initialiser."""
        self._pageno = pageno
        self._parser = parser
        # Each container starts empty; how they are filled is not
        # visible from this class.
        self._texts = []
        self._tables = []
        self._imgs = []

    def __repr__(self) -> str:
        """Formatted representation of this object."""
        return f'<Slide: {self._pageno}>'

    def __str__(self) -> str:
        """Formatted representation of this object, when printed."""
        prefix = f'<Slide: {self._pageno}; '
        # Slide zero is reported as an index offset rather than content.
        if self._pageno == 0:
            return prefix + '<index offset>>'
        details = (f'Text blocks: {len(self._texts)}; ',
                   f'Tables: {len(self._tables)}; ',
                   f'Images: {len(self._imgs)}; ',
                   f'Parser: {bool(self._parser)}>')
        return prefix + ''.join(details)

    @property
    def content(self) -> str:
        """Accessor to the textual content of a slide.

        Returns:
            str: The ``content`` of every text object found on the
            slide, concatenated into a single string with a
            double-newline between objects.

        """
        blocks = [text.content for text in self._texts]
        return '\n\n'.join(blocks)

    @property
    def images(self) -> list:
        """Accessor to a slide's image objects."""
        return self._imgs

    @property
    def pageno(self) -> int:
        """Accessor to the page number.

        Note:
            This is the page number with regard to the page's *sequence
            in the overall document*. This is *not* guaranteed to be the
            page's number per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """Accessor to the document parser's internal functionality.

        Note:
            The population of this property is determined by the
            document-type-specific ``docp`` parser. If the underlying
            parsing library has functionality worth preserving and making
            available to the user, it is stored to this property.
            Otherwise, this property will remain as ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """Accessor to a slide's table objects."""
        return self._tables

    @property
    def texts(self) -> list:
        """Accessor to a slide's text objects."""
        return self._texts