pointblank 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +25 -1
- pointblank/_constants_translations.py +2361 -2
- pointblank/_interrogation.py +24 -0
- pointblank/_typing.py +37 -9
- pointblank/_utils.py +0 -355
- pointblank/_utils_llms_txt.py +661 -0
- pointblank/column.py +24 -0
- pointblank/data/api-docs.txt +336 -3
- pointblank/validate.py +2551 -926
- pointblank/yaml.py +10 -2
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/METADATA +9 -4
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/RECORD +17 -16
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/WHEEL +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,661 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from urllib.parse import urljoin
|
|
6
|
+
|
|
7
|
+
# `requests` is an optional dependency: it is only needed by the web-scraping fallbacks in this
# module. Record whether the import succeeded so those code paths can raise a clear `ImportError`.
try:
    import requests

    SCRAPING_AVAILABLE = True
except ImportError:
    SCRAPING_AVAILABLE = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_api_details(module, exported_list):
    """
    Retrieve the signatures and docstrings of the functions/classes in the exported list.

    Parameters
    ----------
    module : module
        The module from which to retrieve the functions/classes.
    exported_list : list
        A list of function/class names as strings. Dotted names (e.g., `"Validate.col_vals_gt"`)
        are resolved one attribute at a time, starting from `module`.

    Returns
    -------
    str
        A string containing the combined name, signature, and docstring of every entry, each
        followed by a blank line. An empty `exported_list` yields an empty string.

    Raises
    ------
    AttributeError
        If a name in `exported_list` cannot be resolved on `module`.
    """
    entries = []

    for fn in exported_list:
        # Walk the dotted attribute path to handle nested attributes
        obj = module
        for part in fn.split("."):
            obj = getattr(obj, part)

        # Get the name and the call signature of the object
        obj_name = obj.__name__
        sig = inspect.signature(obj)

        # Objects without a docstring have `__doc__` set to `None`; use an empty string so that
        # the literal text "None" doesn't leak into the generated documentation
        doc = obj.__doc__ or ""

        # Combine the name, signature, and docstring
        entries.append(f"{obj_name}{sig}\n{doc}\n\n")

    # Join once at the end instead of growing a string inside the loop
    return "".join(entries)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _get_api_text() -> str:
    """
    Get the API documentation for the Pointblank library.

    The text starts with a banner, followed by one section per family of exported
    functions/methods. Each section consists of a `## The <name> family` heading, a short
    description, and the signature and docstring of every member (via `get_api_details()`).
    The assembled text is then post-processed so that code fences use plain `python`.

    Returns
    -------
    str
        The API documentation for the Pointblank library.
    """

    # Imported here rather than at module level; this module is itself part of the package
    import pointblank

    sep_line = "-" * 70

    api_text = (
        f"{sep_line}\nThis is the API documentation for the Pointblank library.\n{sep_line}\n\n"
    )

    #
    # Lists of exported functions and methods in different families
    #

    validate_exported = [
        "Validate",
        "Thresholds",
        "Actions",
        "FinalActions",
        "Schema",
        "DraftValidation",
    ]

    val_steps_exported = [
        "Validate.col_vals_gt",
        "Validate.col_vals_lt",
        "Validate.col_vals_ge",
        "Validate.col_vals_le",
        "Validate.col_vals_eq",
        "Validate.col_vals_ne",
        "Validate.col_vals_between",
        "Validate.col_vals_outside",
        "Validate.col_vals_in_set",
        "Validate.col_vals_not_in_set",
        "Validate.col_vals_increasing",
        "Validate.col_vals_decreasing",
        "Validate.col_vals_null",
        "Validate.col_vals_not_null",
        "Validate.col_vals_regex",
        "Validate.col_vals_within_spec",
        "Validate.col_vals_expr",
        "Validate.rows_distinct",
        "Validate.rows_complete",
        "Validate.col_exists",
        "Validate.col_pct_null",
        "Validate.col_schema_match",
        "Validate.row_count_match",
        "Validate.col_count_match",
        "Validate.tbl_match",
        "Validate.conjointly",
        "Validate.specially",
        "Validate.prompt",
    ]

    column_selection_exported = [
        "col",
        "starts_with",
        "ends_with",
        "contains",
        "matches",
        "everything",
        "first_n",
        "last_n",
        "expr_col",
    ]

    segments_exported = [
        "seg_group",
    ]

    interrogation_exported = [
        "Validate.interrogate",
        "Validate.set_tbl",
        "Validate.get_tabular_report",
        "Validate.get_step_report",
        "Validate.get_json_report",
        "Validate.get_sundered_data",
        "Validate.get_data_extracts",
        "Validate.all_passed",
        "Validate.assert_passing",
        "Validate.assert_below_threshold",
        "Validate.above_threshold",
        "Validate.n",
        "Validate.n_passed",
        "Validate.n_failed",
        "Validate.f_passed",
        "Validate.f_failed",
        "Validate.warning",
        "Validate.error",
        "Validate.critical",
    ]

    inspect_exported = [
        "DataScan",
        "preview",
        "col_summary_tbl",
        "missing_vals_tbl",
        "assistant",
        "load_dataset",
        "get_data_path",
        "connect_to_table",
        "print_database_tables",
    ]

    yaml_exported = [
        "yaml_interrogate",
        "validate_yaml",
        "yaml_to_python",
    ]

    utility_exported = [
        "get_column_count",
        "get_row_count",
        "get_action_metadata",
        "get_validation_summary",
        "write_file",
        "read_file",
        "config",
    ]

    prebuilt_actions_exported = [
        "send_slack_notification",
    ]

    validate_desc = """When performing data validation, you'll need the `Validate` class to get the
process started. It's given the target table and you can optionally provide some metadata and/or
failure thresholds (using the `Thresholds` class or through shorthands for this task). The
`Validate` class has numerous methods for defining validation steps and for obtaining
post-interrogation metrics and data."""

    val_steps_desc = """Validation steps can be thought of as sequential validations on the target
data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
that, in the aggregate, provides good validation coverage."""

    column_selection_desc = """A flexible way to select columns for validation is to use the `col()`
function along with column selection helper functions. A combination of `col()` + `starts_with()`,
`matches()`, etc., allows for the selection of multiple target columns (mapping a validation across
many steps). Furthermore, the `col()` function can be used to declare a comparison column (e.g.,
for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
for comparison."""

    segments_desc = (
        """Combine multiple values into a single segment using `seg_*()` helper functions."""
    )

    interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
report table (by printing the object or using `get_tabular_report()`), extract key metrics, or we
can split the data based on the validation results (with `get_sundered_data()`)."""

    inspect_desc = """The *Inspection and Assistance* group contains functions that are helpful for
getting to grips on a new data table. Use the `DataScan` class to get a quick overview of the data,
`preview()` to see the first and last few rows of a table, `col_summary_tbl()` for a column-level
summary of a table, `missing_vals_tbl()` to see where there are missing values in a table, and
`get_column_count()`/`get_row_count()` to get the number of columns and rows in a table. Several
datasets included in the package can be accessed via the `load_dataset()` function. Finally, the
`config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
the `assistant()` function to get help with Pointblank."""

    yaml_desc = """The *YAML* group contains functions that allow for the use of YAML to orchestrate
validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow
from YAML strings or files. The `validate_yaml()` function checks if the YAML configuration passes
its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent
Python code."""

    utility_desc = """The Utility Functions group contains functions that are useful for accessing
metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of
columns or rows in a table. The `get_action_metadata()` function is useful when building custom
actions since it returns metadata about the validation step that's triggering the action. Lastly,
the `config()` utility lets us set global configuration parameters."""

    prebuilt_actions_desc = """The Prebuilt Actions group contains a function that can be used to
send a Slack notification when validation steps exceed failure threshold levels or just to provide a
summary of the validation results, including the status, number of steps, passing and failing steps,
table information, and timing details."""

    #
    # Add headings (`*_desc` text) and API details for each family of functions/methods
    #

    # (family name used in the heading, description text, exported names), in output order
    families = [
        ("Validate", validate_desc, validate_exported),
        ("Validation Steps", val_steps_desc, val_steps_exported),
        ("Column Selection", column_selection_desc, column_selection_exported),
        ("Segments", segments_desc, segments_exported),
        ("Interrogation and Reporting", interrogation_desc, interrogation_exported),
        ("Inspection and Assistance", inspect_desc, inspect_exported),
        ("YAML", yaml_desc, yaml_exported),
        ("Utility Functions", utility_desc, utility_exported),
        ("Prebuilt Actions", prebuilt_actions_desc, prebuilt_actions_exported),
    ]

    for family, desc, exported in families:
        api_text += f"\n## The {family} family\n\n{desc}\n\n"
        api_text += get_api_details(module=pointblank, exported_list=exported)

    # Modify language syntax in all code cells
    api_text = api_text.replace("{python}", "python")

    # Remove code cells that contain `#| echo: false` (i.e., don't display the code)
    # NOTE(review): the pattern matches on the shape of a cell (a fixed number of lines between
    # the opening and closing fences) rather than on the `#| echo: false` text itself; confirm
    # that it only removes the intended cells
    api_text = re.sub(r"```python\n\s*.*\n\s*.*\n.*\n.*\n.*```\n\s*", "", api_text)

    return api_text
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _get_examples_text() -> str:
|
|
281
|
+
"""
|
|
282
|
+
Get the examples for the Pointblank library. These examples are extracted from the Quarto
|
|
283
|
+
documents in the `docs/demos` directory.
|
|
284
|
+
|
|
285
|
+
Returns
|
|
286
|
+
-------
|
|
287
|
+
str
|
|
288
|
+
The examples for the Pointblank library.
|
|
289
|
+
"""
|
|
290
|
+
|
|
291
|
+
sep_line = "-" * 70
|
|
292
|
+
|
|
293
|
+
examples_text = (
|
|
294
|
+
f"{sep_line}\nThis is a set of examples for the Pointblank library.\n{sep_line}\n\n"
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
# A large set of examples is available in the docs/demos directory, and each of the
|
|
298
|
+
# subdirectories contains a different example (in the form of a Quarto document)
|
|
299
|
+
|
|
300
|
+
example_dirs = [
|
|
301
|
+
"01-starter",
|
|
302
|
+
"02-advanced",
|
|
303
|
+
"03-data-extracts",
|
|
304
|
+
"04-sundered-data",
|
|
305
|
+
"05-step-report-column-check",
|
|
306
|
+
"06-step-report-schema-check",
|
|
307
|
+
"apply-checks-to-several-columns",
|
|
308
|
+
"check-row-column-counts",
|
|
309
|
+
"checks-for-missing",
|
|
310
|
+
"col-vals-custom-expr",
|
|
311
|
+
"column-selector-functions",
|
|
312
|
+
"comparisons-across-columns",
|
|
313
|
+
"expect-no-duplicate-rows",
|
|
314
|
+
"expect-no-duplicate-values",
|
|
315
|
+
"expect-text-pattern",
|
|
316
|
+
"failure-thresholds",
|
|
317
|
+
"mutate-table-in-step",
|
|
318
|
+
"numeric-comparisons",
|
|
319
|
+
"schema-check",
|
|
320
|
+
"set-membership",
|
|
321
|
+
"using-parquet-data",
|
|
322
|
+
]
|
|
323
|
+
|
|
324
|
+
for example_dir in example_dirs:
|
|
325
|
+
link = f"https://posit-dev.github.io/pointblank/demos/{example_dir}/"
|
|
326
|
+
|
|
327
|
+
# Read in the index.qmd file for each example
|
|
328
|
+
with open(f"docs/demos/{example_dir}/index.qmd", "r") as f:
|
|
329
|
+
example_text = f.read()
|
|
330
|
+
|
|
331
|
+
# Remove the first eight lines of the example text (contains the YAML front matter)
|
|
332
|
+
example_text = "\n".join(example_text.split("\n")[8:])
|
|
333
|
+
|
|
334
|
+
# Extract the title of the example (the line beginning with `###`)
|
|
335
|
+
title = re.search(r"### (.*)", example_text).group(1)
|
|
336
|
+
|
|
337
|
+
# The next line with text is the short description of the example
|
|
338
|
+
desc = re.search(r"(.*)\.", example_text).group(1)
|
|
339
|
+
|
|
340
|
+
# Get all of the Python code blocks in the example
|
|
341
|
+
# these can be identified as starting with ```python and ending with ```
|
|
342
|
+
code_blocks = re.findall(r"```python\n(.*?)```", example_text, re.DOTALL)
|
|
343
|
+
|
|
344
|
+
# Wrap each code block with a leading ```python and trailing ```
|
|
345
|
+
code_blocks = [f"```python\n{code}```" for code in code_blocks]
|
|
346
|
+
|
|
347
|
+
# Collapse all code blocks into a single string
|
|
348
|
+
code_text = "\n\n".join(code_blocks)
|
|
349
|
+
|
|
350
|
+
# Add the example title, description, and code to the examples text
|
|
351
|
+
examples_text += f"### {title} ({link})\n\n{desc}\n\n{code_text}\n\n"
|
|
352
|
+
|
|
353
|
+
return examples_text
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _get_api_and_examples_text() -> str:
    """
    Build the combined reference text: the API documentation followed by the examples.

    Returns
    -------
    str
        The API text and the examples text, joined by two newline characters.
    """

    # The API portion is generated first, then the examples
    parts = (_get_api_text(), _get_examples_text())

    return "\n\n".join(parts)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def scrape_examples_index(base_url: str = "https://posit-dev.github.io/pointblank/") -> list[dict]:
    """
    Extract demo titles, descriptions, and URLs from the examples index page.

    The local `docs/demos/index.qmd` file (looked up relative to the parent of this module's
    directory) is used when it exists. Otherwise the `demos/` page is downloaded from
    `base_url`, which requires the optional `requests` package.

    Parameters
    ----------
    base_url : str
        The base URL of the Pointblank documentation site.

    Returns
    -------
    list[dict]
        A list of dictionaries with 'title', 'description', and 'url' keys.

    Raises
    ------
    ImportError
        If the local file doesn't exist and `requests` is not installed.

    Notes
    -----
    When the page has to be downloaded, any exception raised by `requests` (connection error,
    timeout, or an HTTP error status via `raise_for_status()`) propagates to the caller.
    """
    examples = []

    # Read from local file
    qmd_path = Path(__file__).parent.parent / "docs" / "demos" / "index.qmd"

    if qmd_path.exists():
        # Explicit encoding so the result doesn't depend on the platform's default encoding
        with open(qmd_path, "r", encoding="utf-8") as f:
            content = f.read()
    else:
        # Fallback to web scraping if local file doesn't exist
        if not SCRAPING_AVAILABLE:
            raise ImportError(
                "requests is required for web scraping. Install it with: pip install requests"
            )
        demos_url = urljoin(base_url, "demos/")
        # Without a timeout, `requests` can wait on an unresponsive server indefinitely
        response = requests.get(demos_url, timeout=30)
        response.raise_for_status()
        content = response.text

    # Pattern to match the example structure in the .qmd file:
    # [Title](./path/index.qmd)
    # ... potentially an image ...
    # <p ...>Description</p>
    # NOTE(review): both patterns target .qmd markup (links ending in `/index.qmd`); confirm
    # whether they are expected to match the HTML returned by the web fallback

    # First, get the grid-based examples with images
    grid_pattern = r"\[([^\]]+)\]\(\./([^)]+)/index\.qmd\).*?<p[^>]*>(.*?)</p>"
    matches = re.findall(grid_pattern, content, re.DOTALL)

    for title, path, description in matches:
        url = urljoin(base_url, f"demos/{path}/")
        # Clean up description by removing any HTML tags inside the paragraph
        desc_clean = re.sub(r"<[^>]+>", "", description).strip()
        examples.append({"title": title.strip(), "description": desc_clean, "url": url})

    # Also get the list-style examples (after the <hr>)
    list_pattern = r"\[([^\]]+)\]\(\./([^)]+)/index\.qmd\)<br>\s*([^\n]+)"
    list_matches = re.findall(list_pattern, content)

    for title, path, description in list_matches:
        url = urljoin(base_url, f"demos/{path}/")
        examples.append({"title": title.strip(), "description": description.strip(), "url": url})

    return examples
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def scrape_api_reference_index(
    base_url: str = "https://posit-dev.github.io/pointblank/",
) -> list[dict]:
    """
    Extract function/class names, descriptions, and URLs from the API reference index page.

    The local `docs/reference/index.qmd` file (looked up relative to the parent of this
    module's directory) is used when it exists. Otherwise the `reference/` page is downloaded
    from `base_url`, which requires the optional `requests` package.

    Parameters
    ----------
    base_url : str
        The base URL of the Pointblank documentation site.

    Returns
    -------
    list[dict]
        A list of dictionaries with 'title', 'description', and 'url' keys.

    Raises
    ------
    ImportError
        If the local file doesn't exist and `requests` is not installed.

    Notes
    -----
    When the page has to be downloaded, any exception raised by `requests` (connection error,
    timeout, or an HTTP error status via `raise_for_status()`) propagates to the caller.
    """
    api_items = []

    # Read from local file
    qmd_path = Path(__file__).parent.parent / "docs" / "reference" / "index.qmd"

    if qmd_path.exists():
        # Explicit encoding so the result doesn't depend on the platform's default encoding
        with open(qmd_path, "r", encoding="utf-8") as f:
            content = f.read()
    else:
        # Fallback to web scraping if local file doesn't exist
        if not SCRAPING_AVAILABLE:
            raise ImportError(
                "requests is required for web scraping. Install it with: pip install requests"
            )
        reference_url = urljoin(base_url, "reference/")
        # Without a timeout, `requests` can wait on an unresponsive server indefinitely
        response = requests.get(reference_url, timeout=30)
        response.raise_for_status()
        content = response.text

    # Pattern to match the API reference structure in the .qmd file:
    # | [Function](path.qmd#anchor) | Description |
    # NOTE(review): the pattern targets the Markdown table in the .qmd file; confirm whether
    # it is expected to match the HTML returned by the web fallback

    table_row_pattern = r"\| \[([^\]]+)\]\(([^)]+)\) \| ([^\|]+) \|"
    matches = re.findall(table_row_pattern, content)

    for title, path, description in matches:
        # Extract just the filename without the anchor and change .qmd to .html
        file_path = path.split("#")[0]
        if file_path.endswith(".qmd"):
            file_path = file_path[:-4] + ".html"
        url = urljoin(base_url, f"reference/{file_path}")

        api_items.append({"title": title.strip(), "description": description.strip(), "url": url})

    return api_items
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def generate_llms_txt(
    base_url: str = "https://posit-dev.github.io/pointblank/",
    include_user_guide: bool = True,
) -> str:
    """
    Build the text of the `llms.txt` file for the Pointblank project.

    The output is a Markdown outline: a `# Pointblank` title, a `## Docs` heading, and one
    `###` section with a bulleted list of links for each index that yields entries. A failure
    while building a section is reported with a printed warning and that section is skipped.

    Parameters
    ----------
    base_url : str
        The base URL of the Pointblank documentation site.
    include_user_guide : bool
        Whether to include user guide pages in the output.

    Returns
    -------
    str
        The llms.txt formatted content.

    Raises
    ------
    ImportError
        If the `requests` package is not installed.
    """
    if not SCRAPING_AVAILABLE:
        raise ImportError(
            "requests is required for web scraping. Install it with: pip install requests"
        )

    lines = ["# Pointblank", "", "## Docs", ""]

    # One row per section: heading, warning prefix, index function, and whether a blank line
    # is appended after the section's entries
    sections = [
        ("### Examples", "Warning: Failed to scrape examples index", scrape_examples_index, True),
        (
            "### API Reference",
            "Warning: Failed to scrape API reference",
            scrape_api_reference_index,
            True,
        ),
    ]

    # The user guide section is optional and, being last, gets no trailing blank line
    if include_user_guide:
        sections.append(
            ("### User Guide", "Warning: Failed to scrape user guide", scrape_user_guide_index, False)
        )

    for heading, warning, scrape_index, trailing_blank in sections:
        try:
            entries = scrape_index(base_url)
            if entries:
                lines.append(heading)
                lines.append("")
                for entry in entries:
                    # Only add the `: description` suffix when a description is available
                    suffix = f": {entry['description']}" if entry["description"] else ""
                    lines.append(f"- [{entry['title']}]({entry['url']}){suffix}")
                if trailing_blank:
                    lines.append("")
        except Exception as e:
            print(f"{warning}: {e}")

    return "\n".join(lines)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def scrape_user_guide_index(
    base_url: str = "https://posit-dev.github.io/pointblank/",
) -> list[dict]:
    """
    Get the user guide pages from the local `docs/user-guide` directory.

    Every `.qmd` file in that directory (looked up relative to the parent of this module's
    directory), except `index.qmd`, becomes one entry. Files are processed in sorted order.

    Parameters
    ----------
    base_url : str
        The base URL of the Pointblank documentation site.

    Returns
    -------
    list[dict]
        A list of dictionaries with 'title', 'description', and 'url' keys. The list is empty
        when the user guide directory doesn't exist. The 'description' is always an empty
        string because no description is extracted yet.
    """
    guide_items = []

    # Read from local directory
    user_guide_dir = Path(__file__).parent.parent / "docs" / "user-guide"

    if not user_guide_dir.exists():
        return guide_items

    # Get all .qmd files (excluding index.qmd)
    qmd_files = sorted(f for f in user_guide_dir.glob("*.qmd") if f.name != "index.qmd")

    for qmd_file in qmd_files:
        # Read the file to extract title; the encoding is given explicitly so the result
        # doesn't depend on the platform's default encoding
        with open(qmd_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Try to extract title from YAML frontmatter. Double-quoted, single-quoted, and
        # unquoted values are matched separately so that an apostrophe or quote character
        # inside the title (e.g., `title: "What's New"`) doesn't cut the title short
        title_match = re.search(
            r"""^title:\s*(?:"([^"\n]*)"|'([^'\n]*)'|([^\n]+))""", content, re.MULTILINE
        )

        title = ""
        if title_match:
            # At most one of the three groups matched
            title = next((g for g in title_match.groups() if g), "").strip()

        if not title:
            # Fallback to filename
            title = qmd_file.stem.replace("-", " ").title()

        # Extracting the first paragraph as a description is not implemented; the description
        # is left empty
        description = ""

        url = urljoin(base_url, f"user-guide/{qmd_file.stem}.html")

        guide_items.append({"title": title, "description": description, "url": url})

    return guide_items
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def generate_llms_full_txt(output_path: Optional[str] = None) -> str:
    """
    Generate the llms-full.txt content using the existing api-docs.txt file or by generating
    the API and examples text.

    The `data/api-docs.txt` file next to this module is used when it exists; otherwise the
    content is generated with `_get_api_and_examples_text()`. Files are read and written as
    UTF-8 so that the result doesn't depend on the platform's default encoding.

    Parameters
    ----------
    output_path : str, optional
        Path to save the generated content. If None, content is returned but not saved.

    Returns
    -------
    str
        The llms-full.txt formatted content.
    """
    # Try to use existing api-docs.txt first
    api_docs_path = Path(__file__).parent / "data" / "api-docs.txt"

    if api_docs_path.exists():
        with open(api_docs_path, "r", encoding="utf-8") as f:
            content = f.read()
    else:
        # Generate the content
        content = _get_api_and_examples_text()

    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

    return content
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def main():
    """
    Main function to generate both llms.txt and llms-full.txt files.

    Both files are written to the `docs` directory located next to this module's parent
    directory. Each file is generated independently: a failure is reported with a printed
    message and doesn't prevent the other file from being generated.
    """
    docs_dir = Path(__file__).parent.parent / "docs"

    # Generate llms.txt
    print("Generating llms.txt...")
    try:
        llms_content = generate_llms_txt()
        llms_path = docs_dir / "llms.txt"
        # Explicit encoding so the file doesn't depend on the platform's default encoding
        with open(llms_path, "w", encoding="utf-8") as f:
            f.write(llms_content)
        print(f"✓ Generated {llms_path}")
    except Exception as e:
        # Best-effort script entry point: report the problem and carry on with the next file
        print(f"✗ Failed to generate llms.txt: {e}")

    # Generate llms-full.txt
    print("\nGenerating llms-full.txt...")
    try:
        llms_full_path = docs_dir / "llms-full.txt"
        generate_llms_full_txt(str(llms_full_path))
        print(f"✓ Generated {llms_full_path}")
    except Exception as e:
        print(f"✗ Failed to generate llms-full.txt: {e}")
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
# Generate the files only when this module is executed as a script (not when it is imported)
if __name__ == "__main__":
    main()
|