mseep_txtai-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
txtai/pipeline/data/htmltomd.py
@@ -0,0 +1,414 @@
+"""
+HTMLToMarkdown module
+"""
+
+import re
+
+# Conditional import
+try:
+    from bs4 import BeautifulSoup, NavigableString
+
+    SOUP = True
+except ImportError:
+    SOUP = False
+
+from ..base import Pipeline
+
+
+class HTMLToMarkdown(Pipeline):
+    """
+    HTML to Markdown pipeline.
+
+    Markdown formatting is applied for headings, blockquotes, lists, code, tables and text. Visual formatting is also
+    included (bold, italic, etc.).
+
+    This pipeline searches for the best node that has relevant text, often found with an article, main or body tag.
+    """
+
+    def __init__(self, paragraphs=False, sections=False):
+        """
+        Creates a new HTMLToMarkdown instance.
+
+        Args:
+            paragraphs: True if paragraph parsing enabled, False otherwise
+            sections: True if section parsing enabled, False otherwise
+        """
+
+        if not SOUP:
+            raise ImportError('HTMLToMarkdown pipeline is not available - install "pipeline" extra to enable')
+
+        self.paragraphs = paragraphs
+        self.sections = sections
+
+    def __call__(self, html):
+        """
+        Transforms input HTML into Markdown formatted text.
+
+        Args:
+            html: input html
+
+        Returns:
+            markdown formatted text
+        """
+
+        # HTML Parser
+        soup = BeautifulSoup(html, features="html.parser")
+
+        # Ignore script and style tags
+        for script in soup.find_all(["script", "style"]):
+            script.decompose()
+
+        # Check for article sections
+        article = next((x for x in ["article", "main"] if soup.find(x)), None)
+
+        # Extract text from each section element
+        nodes = []
+        for node in soup.find_all(article if article else "body"):
+            # Skip article sections without at least 1 paragraph
+            if not article or node.find("p"):
+                nodes.append(self.process(node, article))
+
+        # Return extracted text, fallback to default text extraction if no nodes found
+        return "\n".join(self.metadata(soup) + nodes) if nodes else self.default(soup)
+
+    def process(self, node, article):
+        """
+        Extracts text from a node. This method applies transforms for headings, blockquotes, lists, code, tables and text.
+        Page breaks are detected and reflected in the output text as a page break character.
+
+        Args:
+            node: input node
+            article: True if the main section node is an article
+
+        Returns:
+            node text
+        """
+
+        if self.isheader(node):
+            return self.header(node, article)
+
+        if node.name in ("blockquote", "q"):
+            return self.block(node)
+
+        if node.name in ("ul", "ol"):
+            return self.items(node, article)
+
+        if node.name in ("code", "pre"):
+            return self.code(node)
+
+        if node.name == "table":
+            return self.table(node, article)
+
+        # Nodes to skip
+        if node.name in ("aside",) + (() if article else ("header", "footer")):
+            return ""
+
+        # Check if this node is a page break element
+        page = node.name and node.get("class") and "page" in node.get("class")
+
+        # Get node children
+        children = self.children(node)
+
+        # Join elements into text
+        if self.iscontainer(node, children):
+            texts = [self.process(node, article) for node in children]
+            text = "\n".join(text for text in texts if text or not article)
+        else:
+            text = self.text(node, article)
+
+        # Add page breaks, if section parsing enabled. Otherwise add node text.
+        return f"{text}\f" if page and self.sections else text
+
+    def metadata(self, node):
+        """
+        Builds a metadata section. The metadata section consists of the title and
+        description fields.
+
+        Args:
+            node: input document node
+
+        Returns:
+            metadata as a list
+        """
+
+        title = node.find("title")
+        metadata = [f"**{title.text.strip()}**"] if title and title.text else []
+
+        description = node.find("meta", attrs={"name": "description"})
+        if description and description["content"]:
+            metadata.append(f"\n*{description['content'].strip()}*")
+
+        # Add separator
+        if metadata:
+            metadata.append("\f" if self.sections else "\n\n")
+
+        return metadata
+
+    def default(self, soup):
+        """
+        Default text handler when valid HTML isn't detected.
+
+        Args:
+            soup: BeautifulSoup object
+
+        Returns:
+            text
+        """
+
+        lines = []
+        for line in soup.get_text().split("\n"):
+            # Detect markdown headings and add page breaks
+            lines.append(f"\f{line}" if self.sections and re.search(r"^#+ ", line) else line)
+
+        return "\n".join(lines)
+
+    def text(self, node, article):
+        """
+        Text handler. This method flattens a node and its children to text.
+
+        Args:
+            node: input node
+            article: True if the main section node is an article
+
+        Returns:
+            node text
+        """
+
+        # Get node children if available, otherwise use node as item
+        items = self.children(node)
+        items = items if items else [node]
+
+        # Apply emphasis and link formatting
+        texts = []
+        for x in items:
+            target, text = x if x.name else node, x.text
+
+            if text.strip():
+                if target.name in ("b", "strong"):
+                    text = f"**{text.strip()}** "
+                elif target.name in ("i", "em"):
+                    text = f"*{text.strip()}* "
+                elif target.name == "a":
+                    text = f"[{text.strip()}]({target.get('href')}) "
+
+            texts.append(text)
+
+        # Join text elements
+        text = "".join(texts)
+
+        # Article text processing
+        text = self.articletext(node, text) if article else text
+
+        # Return text, strip leading/trailing whitespace if this is a string only node
+        text = text if node.name and text else text.strip()
+
+        return text
+
+    def header(self, node, article):
+        """
+        Header handler. This method transforms an HTML heading into a Markdown formatted heading.
+
+        Args:
+            node: input node
+            article: True if the main section node is an article
+
+        Returns:
+            heading as markdown
+        """
+
+        # Get heading level and text
+        level = "#" * int(node.name[1])
+        text = self.text(node, article)
+
+        # Add section break or newline, if necessary
+        level = f"\f{level}" if self.sections else f"\n{level}"
+
+        # Return formatted header. Remove leading whitespace as it was added before level in step above.
+        return f"{level} {text.lstrip()}" if text.strip() else ""
+
+    def block(self, node):
+        """
+        Blockquote handler. This method transforms an HTML blockquote or q block into a Markdown formatted
+        blockquote.
+
+        Args:
+            node: input node
+
+        Returns:
+            block as markdown
+        """
+
+        text = "\n".join(f"> {x}" for x in node.text.strip().split("\n"))
+        return f"{text}\n\n" if self.paragraphs else f"{text}\n"
+
+    def items(self, node, article):
+        """
+        List handler. This method transforms an HTML ordered/unordered list into a Markdown formatted list.
+
+        Args:
+            node: input node
+            article: True if the main section node is an article
+
+        Returns:
+            list as markdown
+        """
+
+        elements = []
+        for x, element in enumerate(node.find_all("li")):
+            # Unordered lists use dashes. Ordered lists use numbers.
+            prefix = "-" if node.name == "ul" else f"{x + 1}."
+
+            # List item text
+            text = self.process(element, article)
+
+            # Add list element
+            if text:
+                elements.append(f"{prefix} {text}")
+
+        # Join elements together as string
+        return "\n".join(elements)
+
+    def code(self, node):
+        """
+        Code block handler. This method transforms an HTML pre or code block into a Markdown formatted
+        code block.
+
+        Args:
+            node: input node
+
+        Returns:
+            code as markdown
+        """
+
+        text = f"```\n{node.text.strip()}\n```"
+        return f"{text}\n\n" if self.paragraphs else f"{text}\n"
+
+    def table(self, node, article):
+        """
+        Table handler. This method transforms an HTML table into a Markdown formatted table.
+
+        Args:
+            node: input node
+            article: True if the main section node is an article
+
+        Returns:
+            table as markdown
+        """
+
+        elements, header = [], False
+
+        # Process all rows
+        rows = node.find_all("tr")
+        for row in rows:
+            # Get list of columns for row
+            columns = row.find_all(lambda tag: tag.name in ("th", "td"))
+
+            # Add columns with separator
+            elements.append(f"|{'|'.join(self.process(column, article) for column in columns)}|")
+
+            # If there are multiple rows, add header format row
+            if not header and len(rows) > 1:
+                elements.append(f"{'|---' * len(columns)}|")
+                header = True
+
+        # Join elements together as string
+        return "\n".join(elements)
+
+    def iscontainer(self, node, children):
+        """
+        Analyzes a node and its children to determine if this is a container element. A container
+        element is defined as being a div, body, article or not having any string elements as children.
+
+        Args:
+            node: input node
+            children: input node's children
+
+        Returns:
+            True if this is a container element, False otherwise
+        """
+
+        return children and (node.name in ("div", "body", "article") or not any(isinstance(x, NavigableString) for x in children))
+
+    def children(self, node):
+        """
+        Gets the node children, if available.
+
+        Args:
+            node: input node
+
+        Returns:
+            node children or None if not available
+        """
+
+        if node.name and node.contents:
+            # Iterate over children and remove whitespace-only string nodes
+            return [node for node in node.contents if node.name or node.text.strip()]
+
+        return None
+
+    def articletext(self, node, text):
+        """
+        Transforms node text using article parsing rules. Article parsing is designed to extract text content from web articles.
+        It ignores navigation headers and other superfluous elements.
+
+        Args:
+            node: input node
+            text: current text
+
+        Returns:
+            article text
+        """
+
+        # List of valid text nodes
+        valid = ("p", "th", "td", "li", "a", "b", "strong", "i", "em")
+
+        # Check if this node is valid or it's part of a table cell
+        valid = node.name in valid or (node.parent and node.parent.name in ("th", "td"))
+
+        # Check if text is valid article text
+        text = text if (valid or self.isheader(node)) and not self.islink(node) else ""
+        if text:
+            # Replace non-breaking space plus newline with double newline
+            text = text.replace("\xa0\n", "\n\n")
+
+            # Format paragraph whitespace
+            if node.name == "p":
+                text = f"{text.strip()}\n\n" if self.paragraphs else f"{text.strip()}\n"
+
+        return text
+
+    def isheader(self, node):
+        """
+        Checks if node is a header node.
+
+        Args:
+            node: input node
+
+        Returns:
+            True if node is a header node, False otherwise
+        """
+
+        return node.name in ("h1", "h2", "h3", "h4", "h5", "h6")
+
+    def islink(self, node):
+        """
+        Checks if node is a link node. This method does not consider links within table cells as link nodes.
+
+        Args:
+            node: input node
+
+        Returns:
+            True if node is a link node, False otherwise
+        """
+
+        # Check if this is a link node or link container
+        link, parent = False, node
+        while parent:
+            if parent.name == "a":
+                link = True
+                break
+
+            parent = parent.parent
+
+        # Return if this node or any parents are a link. Ignore links in table cells.
+        return link and node.parent.name not in ("th", "td")
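
For reference, a minimal usage sketch of the pipeline above (the `txtai.pipeline` import path and the sample HTML are illustrative assumptions, not part of this diff):

from txtai.pipeline import HTMLToMarkdown

# Hypothetical HTML input with an article section
html = "<html><head><title>Example</title></head><body><article><h1>Intro</h1><p>Some <b>bold</b> text.</p></article></body></html>"

# Convert HTML to Markdown, adding blank lines between paragraphs
md = HTMLToMarkdown(paragraphs=True)
print(md(html))

With these inputs, the pipeline should emit the page title as bold metadata, the h1 as a `#` heading and the paragraph with `**bold**` emphasis.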
txtai/pipeline/data/segmentation.py
@@ -0,0 +1,178 @@
+"""
+Segmentation module
+"""
+
+import re
+
+# Conditional import
+try:
+    from nltk import sent_tokenize
+
+    NLTK = True
+except ImportError:
+    NLTK = False
+
+# Conditional import
+try:
+    import chonkie
+
+    CHONKIE = True
+except ImportError:
+    CHONKIE = False
+
+from ..base import Pipeline
+
+
+class Segmentation(Pipeline):
+    """
+    Segments text into logical units.
+    """
+
+    def __init__(
+        self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, sections=False, cleantext=True, chunker=None, **kwargs
+    ):
+        """
+        Creates a new Segmentation pipeline.
+
+        Args:
+            sentences: tokenizes text into sentences if True, defaults to False
+            lines: tokenizes text into lines if True, defaults to False
+            paragraphs: tokenizes text into paragraphs if True, defaults to False
+            minlength: require at least minlength characters per text element, defaults to None
+            join: joins tokenized sections back together if True, defaults to False
+            sections: tokenizes text into sections if True, defaults to False. Splits using section or page breaks, depending on what's available
+            cleantext: apply text cleaning rules, defaults to True
+            chunker: creates a third-party chunker to tokenize text if set, defaults to None
+            kwargs: additional keyword arguments
+        """
+
+        if not NLTK and sentences:
+            raise ImportError('NLTK is not available - install "pipeline" extra to enable')
+
+        if not CHONKIE and chunker:
+            raise ImportError('Chonkie is not available - install "pipeline" extra to enable')
+
+        self.sentences = sentences
+        self.lines = lines
+        self.paragraphs = paragraphs
+        self.sections = sections
+        self.minlength = minlength
+        self.join = join
+        self.cleantext = cleantext
+
+        # Create a third-party chunker, if applicable
+        self.chunker = self.createchunker(chunker, **kwargs) if chunker else None
+
+    def __call__(self, text):
+        """
+        Segments text into semantic units.
+
+        This method supports text as a string or a list. If the input is a string, the return
+        type is text|list. If text is a list, a list is returned. This could be a
+        list of text or a list of lists depending on the tokenization strategy.
+
+        Args:
+            text: text|list
+
+        Returns:
+            segmented text
+        """
+
+        # Get inputs
+        texts = [text] if not isinstance(text, list) else text
+
+        # Extract text for each input
+        results = []
+        for value in texts:
+            # Get text
+            value = self.text(value)
+
+            # Parse and add extracted results
+            results.append(self.parse(value))
+
+        return results[0] if isinstance(text, str) else results
+
+    def text(self, text):
+        """
+        Hook to allow extracting text out of the input text object.
+
+        Args:
+            text: object to extract text from
+        """
+
+        return text
+
+    def parse(self, text):
+        """
+        Splits and cleans text based on the current parameters.
+
+        Args:
+            text: input text
+
+        Returns:
+            parsed and clean content
+        """
+
+        content = None
+
+        if self.chunker:
+            # pylint: disable=E1102
+            content = [self.clean(x.text) for x in self.chunker(text)]
+        elif self.sentences:
+            content = [self.clean(x) for x in sent_tokenize(text)]
+        elif self.lines:
+            content = [self.clean(x) for x in re.split(r"\n{1,}", text)]
+        elif self.paragraphs:
+            content = [self.clean(x) for x in re.split(r"\n{2,}", text)]
+        elif self.sections:
+            split = r"\f" if "\f" in text else r"\n{3,}"
+            content = [self.clean(x) for x in re.split(split, text)]
+        else:
+            content = self.clean(text)
+
+        # Text tokenization enabled
+        if isinstance(content, list):
+            # Remove empty strings
+            content = [x for x in content if x]
+            return " ".join(content) if self.join else content
+
+        # Default method that returns clean text
+        return content
+
+    def clean(self, text):
+        """
+        Applies a series of rules to clean text.
+
+        Args:
+            text: input text
+
+        Returns:
+            clean text
+        """
+
+        # Text cleaning disabled, return original text
+        if not self.cleantext:
+            return text
+
+        # Collapse and remove excess whitespace
+        text = re.sub(r" +", " ", text)
+        text = text.strip()
+
+        # If minlength enabled, require at least minlength chars
+        return text if not self.minlength or len(text) >= self.minlength else None
+
+    def createchunker(self, chunker, **kwargs):
+        """
+        Creates a new third-party chunker.
+
+        Args:
+            chunker: name of chunker to create
+            kwargs: additional keyword arguments
+
+        Returns:
+            new chunker
+        """
+
+        # Resolve and create a third-party chunker
+        chunker = f"{chunker[0].upper() + chunker[1:]}Chunker"
+        return getattr(chonkie, chunker)(**kwargs)
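
And a quick sketch of the Segmentation pipeline above (again, the `txtai.pipeline` import path is an assumption based on this package's layout; paragraph mode needs no optional dependencies):

from txtai.pipeline import Segmentation

# Split raw text on blank lines into a list of paragraphs;
# clean() collapses repeated spaces and strips whitespace
segment = Segmentation(paragraphs=True)
print(segment("First paragraph.\n\nSecond  paragraph."))
# Expected: ['First paragraph.', 'Second paragraph.']

Passing sentences=True instead would route through NLTK's sent_tokenize, and chunker="token" (for example) would resolve to a chonkie TokenChunker via createchunker.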