lens-qda 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lens_qda-0.2.0/LICENSE +27 -0
- lens_qda-0.2.0/PKG-INFO +95 -0
- lens_qda-0.2.0/README.md +68 -0
- lens_qda-0.2.0/lens_qda/__init__.py +11 -0
- lens_qda-0.2.0/lens_qda/cli.py +240 -0
- lens_qda-0.2.0/lens_qda.egg-info/PKG-INFO +95 -0
- lens_qda-0.2.0/lens_qda.egg-info/SOURCES.txt +11 -0
- lens_qda-0.2.0/lens_qda.egg-info/dependency_links.txt +1 -0
- lens_qda-0.2.0/lens_qda.egg-info/entry_points.txt +2 -0
- lens_qda-0.2.0/lens_qda.egg-info/requires.txt +1 -0
- lens_qda-0.2.0/lens_qda.egg-info/top_level.txt +1 -0
- lens_qda-0.2.0/pyproject.toml +64 -0
- lens_qda-0.2.0/setup.cfg +4 -0
lens_qda-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
> Mirror of the project-root `/LICENSE`. Copy-edits should land in both files
|
|
4
|
+
> (or replace this with `ln -s ../LICENSE python/LICENSE` so there is one
|
|
5
|
+
> source of truth).
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Mark Bouck
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
lens_qda-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lens-qda
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python utilities for LENS, a local-first qualitative data analysis (QDA) tool.
|
|
5
|
+
Author: LENS contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mabo-du/lens
|
|
8
|
+
Project-URL: Source, https://github.com/mabo-du/lens/tree/main/python
|
|
9
|
+
Project-URL: Issues, https://github.com/mabo-du/lens/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/mabo-du/lens/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: qda,qualitative,research,pdf,text-mining
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pdfplumber==0.11.4
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# lens-qda
|
|
29
|
+
|
|
30
|
+
Python utilities for **[LENS](https://github.com/mabo-du/lens)**, a local-first
|
|
31
|
+
qualitative data analysis (QDA) desktop application.
|
|
32
|
+
|
|
33
|
+
This package bundles the same PDF text-extraction pipeline that the LENS
|
|
34
|
+
desktop app uses to ingest PDF documents, exposing it as a small CLI so it can
|
|
35
|
+
also be used directly from Python or from shell scripts.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install lens-qda
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Requires Python 3.8+ and the prebuilt wheels for `pdfplumber` and its
|
|
44
|
+
dependencies (`cryptography`, `pillow`, `pdfminer.six`, ...) on PyPI; no
|
|
45
|
+
compiler is needed on supported platforms.
|
|
46
|
+
|
|
47
|
+
## CLI usage
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Print plain text extracted from a PDF (one paragraph per page):
|
|
51
|
+
lens-qda extract path/to/paper.pdf
|
|
52
|
+
|
|
53
|
+
# Emit the same JSON envelope the LENS desktop sidecar produces:
|
|
54
|
+
lens-qda extract paper.pdf --json
|
|
55
|
+
|
|
56
|
+
# Save the extracted text to a file:
|
|
57
|
+
lens-qda extract paper.pdf -o paper.txt
|
|
58
|
+
|
|
59
|
+
# Tune pdfplumber's tolerances (defaults match the sidecar):
|
|
60
|
+
lens-qda extract paper.pdf --x-tolerance 3 --y-tolerance 3
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The `--json` schema matches the contract the LENS Tauri sidecar already
|
|
64
|
+
implements:
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{ "success": true, "text": "...all pages, joined by blank lines..." }
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
On failure:
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{ "success": false, "error": "<exception message>" }
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
(the process exits with status 1 in that case).
|
|
77
|
+
|
|
78
|
+
## Programmatic usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from pathlib import Path
|
|
82
|
+
import json, subprocess
|
|
83
|
+
|
|
84
|
+
result = subprocess.run(
|
|
85
|
+
["lens-qda", "extract", "paper.pdf", "--json"],
|
|
86
|
+
capture_output=True, text=True,  # no check=True: a failed extraction exits 1 but still prints the envelope
|
|
87
|
+
)
|
|
88
|
+
envelope = json.loads(result.stdout)
|
|
89
|
+
assert envelope["success"], envelope["error"]
|
|
90
|
+
corpus = envelope["text"]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT — same as the parent [LENS](https://github.com/mabo-du/lens) project.
|
lens_qda-0.2.0/README.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# lens-qda
|
|
2
|
+
|
|
3
|
+
Python utilities for **[LENS](https://github.com/mabo-du/lens)**, a local-first
|
|
4
|
+
qualitative data analysis (QDA) desktop application.
|
|
5
|
+
|
|
6
|
+
This package bundles the same PDF text-extraction pipeline that the LENS
|
|
7
|
+
desktop app uses to ingest PDF documents, exposing it as a small CLI so it can
|
|
8
|
+
also be used directly from Python or from shell scripts.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install lens-qda
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Requires Python 3.8+ and the prebuilt wheels for `pdfplumber` and its
|
|
17
|
+
dependencies (`cryptography`, `pillow`, `pdfminer.six`, ...) on PyPI; no
|
|
18
|
+
compiler is needed on supported platforms.
|
|
19
|
+
|
|
20
|
+
## CLI usage
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Print plain text extracted from a PDF (one paragraph per page):
|
|
24
|
+
lens-qda extract path/to/paper.pdf
|
|
25
|
+
|
|
26
|
+
# Emit the same JSON envelope the LENS desktop sidecar produces:
|
|
27
|
+
lens-qda extract paper.pdf --json
|
|
28
|
+
|
|
29
|
+
# Save the extracted text to a file:
|
|
30
|
+
lens-qda extract paper.pdf -o paper.txt
|
|
31
|
+
|
|
32
|
+
# Tune pdfplumber's tolerances (defaults match the sidecar):
|
|
33
|
+
lens-qda extract paper.pdf --x-tolerance 3 --y-tolerance 3
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The `--json` schema matches the contract the LENS Tauri sidecar already
|
|
37
|
+
implements:
|
|
38
|
+
|
|
39
|
+
```json
|
|
40
|
+
{ "success": true, "text": "...all pages, joined by blank lines..." }
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
On failure:
|
|
44
|
+
|
|
45
|
+
```json
|
|
46
|
+
{ "success": false, "error": "<exception message>" }
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
(the process exits with status 1 in that case).
|
|
50
|
+
|
|
51
|
+
## Programmatic usage
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from pathlib import Path
|
|
55
|
+
import json, subprocess
|
|
56
|
+
|
|
57
|
+
result = subprocess.run(
|
|
58
|
+
["lens-qda", "extract", "paper.pdf", "--json"],
|
|
59
|
+
capture_output=True, text=True,  # no check=True: a failed extraction exits 1 but still prints the envelope
|
|
60
|
+
)
|
|
61
|
+
envelope = json.loads(result.stdout)
|
|
62
|
+
assert envelope["success"], envelope["error"]
|
|
63
|
+
corpus = envelope["text"]
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## License
|
|
67
|
+
|
|
68
|
+
MIT — same as the parent [LENS](https://github.com/mabo-du/lens) project.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""lens-qda: Python utilities for the LENS qualitative data analysis tool.
|
|
2
|
+
|
|
3
|
+
The version string is set at build time from the git tag by
|
|
4
|
+
``.github/workflows/release.yml`` so PyPI releases stay in lockstep with the
|
|
5
|
+
desktop release.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
__version__ = "0.2.0"
|
|
11
|
+
__all__ = ["__version__"]
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""Command-line interface for ``lens-qda``.
|
|
2
|
+
|
|
3
|
+
Mirrors the JSON-envelope contract of the Tauri PDF sidecar
|
|
4
|
+
(``src-tauri/sidecars/pdfplumber/extract.py``) so the same Rust parser can
|
|
5
|
+
consume output from the ``lens-qda extract --json`` invocation and from the
|
|
6
|
+
bundled executable.
|
|
7
|
+
|
|
8
|
+
Entry point declared in ``python/pyproject.toml``::
|
|
9
|
+
|
|
10
|
+
[project.scripts]
|
|
11
|
+
lens-qda = "lens_qda.cli:main"
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Optional, Sequence
|
|
21
|
+
|
|
22
|
+
from lens_qda import __version__
|
|
23
|
+
|
|
24
|
+
# Default tolerances must match src-tauri/sidecars/pdfplumber/extract.py
|
|
25
|
+
# (the canonical Tauri sidecar) so the two code paths produce identical text.
|
|
26
|
+
DEFAULT_X_TOLERANCE = 3
|
|
27
|
+
DEFAULT_Y_TOLERANCE = 3
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _extract_pages(
    pdf_path: Path, *, x_tolerance: int, y_tolerance: int
) -> list[str]:
    """Collect the text of every non-empty page of *pdf_path*, in page order.

    Each page is read with ``page.extract_text`` using the given tolerances.
    When that call returns ``None`` the page is retried with
    ``page.extract_text_simple()``. Pages whose resulting text is empty are
    left out of the returned list.
    """
    # Deferred import: keeps `lens-qda --version` usable when pdfplumber is
    # not installed.
    import pdfplumber  # type: ignore

    def _page_text(page):
        extracted = page.extract_text(
            x_tolerance=x_tolerance, y_tolerance=y_tolerance
        )
        if extracted is not None:
            return extracted
        # Same fallback the Tauri sidecar uses, per the original note here.
        return page.extract_text_simple()

    with pdfplumber.open(str(pdf_path)) as document:
        page_texts = [_page_text(page) for page in document.pages]
    return [text for text in page_texts if text]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def extract_text(
    pdf_path: Path,
    *,
    x_tolerance: int = DEFAULT_X_TOLERANCE,
    y_tolerance: int = DEFAULT_Y_TOLERANCE,
) -> str:
    """Extract *pdf_path* as a single string, one blank line between pages.

    The tolerances are forwarded unchanged to ``_extract_pages``; any
    exception it raises propagates to the caller.
    """
    page_texts = _extract_pages(
        pdf_path, x_tolerance=x_tolerance, y_tolerance=y_tolerance
    )
    page_separator = "\n\n"
    return page_separator.join(page_texts)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def extract_json(
    pdf_path: Path,
    *,
    x_tolerance: int = DEFAULT_X_TOLERANCE,
    y_tolerance: int = DEFAULT_Y_TOLERANCE,
) -> dict[str, object]:
    """Run the extraction and wrap the outcome in the sidecar JSON envelope.

    Returns ``{"success": True, "text": <pages joined by blank lines>}`` when
    extraction succeeds, and ``{"success": False, "error": <message>}`` when
    it raises any ``Exception``. The failure is reported through the envelope
    rather than propagated, so callers only need to branch on ``"success"``.
    """
    try:
        # Reuse extract_text so the page-joining rule lives in one place.
        text = extract_text(
            pdf_path, x_tolerance=x_tolerance, y_tolerance=y_tolerance
        )
    except Exception as exc:  # noqa: BLE001 — envelope must capture any failure
        # Some exceptions stringify to "" (e.g. ``AssertionError()``); fall
        # back to the class name so the consumer never sees an empty error.
        return {"success": False, "error": str(exc) or type(exc).__name__}
    return {"success": True, "text": text}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _emit(text: str, output: Optional[Path]) -> None:
    """Write *text* to *output*, or to stdout when *output* is ``None``.

    File output is written verbatim as UTF-8, creating missing parent
    directories first. Stdout output is newline-terminated: a newline is
    appended only when *text* does not already end with one.

    If stdout's own encoding cannot represent the text (for example an ASCII
    or cp1252 pipe receiving non-Latin PDF text), the payload is written as
    UTF-8 bytes to ``sys.stdout.buffer`` instead of raising
    ``UnicodeEncodeError``, matching the encoding used for file output. The
    error is re-raised only when stdout exposes no binary ``buffer``.
    """
    if output is not None:
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(text, encoding="utf-8")
        return

    # Build the whole payload first so it is emitted in a single encoding.
    payload = text if text.endswith("\n") else text + "\n"
    try:
        sys.stdout.write(payload)
    except UnicodeEncodeError:
        raw = getattr(sys.stdout, "buffer", None)
        if raw is None:
            raise
        # Flush pending text first so earlier output keeps its position.
        sys.stdout.flush()
        raw.write(payload.encode("utf-8"))
        raw.flush()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _emit_envelope(envelope: dict[str, object], output: Optional[Path]) -> None:
    """Serialise *envelope* to JSON and pass it to ``_emit``.

    ``ensure_ascii=False`` keeps non-ASCII characters literal rather than
    escaping them, so every CLI branch renders the {success, text|error}
    envelope the same way.
    """
    rendered = json.dumps(envelope, ensure_ascii=False)
    _emit(rendered, output)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _fail_extract(envelope: dict[str, object], args: argparse.Namespace) -> int:
    """Report a failure envelope in the format the user asked for; return 1.

    With ``--json`` the envelope itself is emitted (to ``args.output`` or
    stdout); otherwise only its ``error`` message goes to stderr.
    """
    if args.json:
        _emit_envelope(envelope, args.output)
    else:
        sys.stderr.write(f"lens-qda: {envelope['error']}\n")
    return 1


def cmd_extract(args: argparse.Namespace) -> int:
    """Handle ``lens-qda extract``: returns 0 on success and 1 on any failure.

    Reads ``path``, ``json``, ``output``, ``x_tolerance`` and ``y_tolerance``
    from *args*.
    """
    source = Path(args.path)

    # Report "missing" and "exists but is not a regular file" separately so
    # the message matches the actual cause.
    if not source.exists():
        return _fail_extract(
            {"success": False, "error": f"file not found: {source}"}, args
        )
    if not source.is_file():
        return _fail_extract(
            {"success": False, "error": f"not a regular file: {source}"}, args
        )

    # extract_json converts extraction errors into a failure envelope, so the
    # only thing to branch on here is the ``success`` flag.
    envelope = extract_json(
        source, x_tolerance=args.x_tolerance, y_tolerance=args.y_tolerance
    )
    if not envelope["success"]:
        return _fail_extract(envelope, args)

    if args.json:
        _emit_envelope(envelope, args.output)
    else:
        _emit(str(envelope["text"]), args.output)
    return 0
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def cmd_version(args: argparse.Namespace) -> int:
    """Emit the package version to ``args.output`` (or stdout) and return 0."""
    # A namespace with no ``output`` attribute is treated like ``output=None``,
    # i.e. the version goes to stdout.
    destination = getattr(args, "output", None)
    _emit(__version__, destination)
    return 0
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def build_parser() -> argparse.ArgumentParser:
    """Assemble the ``lens-qda`` argument parser.

    Layout:

    * top level: the ``-V/--version`` flag;
    * ``extract`` (alias ``x``): positional ``path`` plus ``-o/--output``,
      ``-j/--json``, ``--x-tolerance`` and ``--y-tolerance``;
    * ``version``: optional ``-o/--output``.

    Each subcommand records its handler under ``func`` via ``set_defaults``.
    """
    root_description = (
        "Python utilities for LENS, a local-first qualitative data analysis "
        "(QDA) tool. Currently exposes a PDF text-extraction command that "
        "matches the LENS desktop app's bundled sidecar."
    )
    extract_description = (
        "Extract text from a PDF using pdfplumber, with tolerance settings "
        "matching the LENS desktop sidecar. Print plain text by default; "
        "pass --json to emit the {success, text|error} envelope consumed "
        "by the Tauri Rust layer."
    )

    root = argparse.ArgumentParser(prog="lens-qda", description=root_description)
    root.add_argument(
        "-V",
        "--version",
        action="store_true",
        help="print the lens-qda version and exit",
    )
    # Parser-level defaults: ``output`` and ``version`` exist on the namespace
    # even when no subcommand was given.
    root.set_defaults(output=None, version=False)

    commands = root.add_subparsers(dest="cmd", metavar="COMMAND")

    # --- extract -----------------------------------------------------------
    extract_cmd = commands.add_parser(
        "extract",
        aliases=["x"],
        help="extract text from a PDF document",
        description=extract_description,
    )
    extract_cmd.add_argument("path", help="path to a PDF file")
    extract_cmd.add_argument(
        "-o",
        "--output",
        type=Path,
        help="write output to this file instead of stdout",
    )
    extract_cmd.add_argument(
        "-j",
        "--json",
        action="store_true",
        help="emit the JSON envelope {success, text|error} matching the sidecar",
    )
    # The two tolerance flags differ only in name, default and axis wording.
    for flag, default, direction in (
        ("--x-tolerance", DEFAULT_X_TOLERANCE, "horizontal"),
        ("--y-tolerance", DEFAULT_Y_TOLERANCE, "vertical"),
    ):
        extract_cmd.add_argument(
            flag,
            type=int,
            default=default,
            help=f"{direction} text-clustering tolerance (default: {default})",
        )
    extract_cmd.set_defaults(func=cmd_extract)

    # --- version -----------------------------------------------------------
    version_cmd = commands.add_parser(
        "version", help="print lens-qda version and exit"
    )
    version_cmd.add_argument(
        "-o",
        "--output",
        type=Path,
        help="write version to this file instead of stdout",
    )
    version_cmd.set_defaults(func=cmd_version)

    return root
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Parse *argv* and dispatch; this is the console-script entry point.

    *argv* is passed straight to ``parse_args`` (``None`` means the process
    arguments). Returns the exit status: the handler's return value, or 1
    after printing help to stderr when no subcommand was given.
    """
    cli = build_parser()
    namespace = cli.parse_args(argv)

    # The top-level -V/--version flag takes precedence over any subcommand.
    if namespace.version:
        return cmd_version(namespace)

    handler = getattr(namespace, "func", None)
    if handler is not None:
        return handler(namespace)

    # No subcommand selected: show usage on stderr and signal failure.
    cli.print_help(sys.stderr)
    return 1
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
if __name__ == "__main__": # pragma: no cover
|
|
240
|
+
sys.exit(main())
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lens-qda
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python utilities for LENS, a local-first qualitative data analysis (QDA) tool.
|
|
5
|
+
Author: LENS contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mabo-du/lens
|
|
8
|
+
Project-URL: Source, https://github.com/mabo-du/lens/tree/main/python
|
|
9
|
+
Project-URL: Issues, https://github.com/mabo-du/lens/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/mabo-du/lens/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: qda,qualitative,research,pdf,text-mining
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pdfplumber==0.11.4
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# lens-qda
|
|
29
|
+
|
|
30
|
+
Python utilities for **[LENS](https://github.com/mabo-du/lens)**, a local-first
|
|
31
|
+
qualitative data analysis (QDA) desktop application.
|
|
32
|
+
|
|
33
|
+
This package bundles the same PDF text-extraction pipeline that the LENS
|
|
34
|
+
desktop app uses to ingest PDF documents, exposing it as a small CLI so it can
|
|
35
|
+
also be used directly from Python or from shell scripts.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install lens-qda
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Requires Python 3.8+ and the prebuilt wheels for `pdfplumber` and its
|
|
44
|
+
dependencies (`cryptography`, `pillow`, `pdfminer.six`, ...) on PyPI; no
|
|
45
|
+
compiler is needed on supported platforms.
|
|
46
|
+
|
|
47
|
+
## CLI usage
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Print plain text extracted from a PDF (one paragraph per page):
|
|
51
|
+
lens-qda extract path/to/paper.pdf
|
|
52
|
+
|
|
53
|
+
# Emit the same JSON envelope the LENS desktop sidecar produces:
|
|
54
|
+
lens-qda extract paper.pdf --json
|
|
55
|
+
|
|
56
|
+
# Save the extracted text to a file:
|
|
57
|
+
lens-qda extract paper.pdf -o paper.txt
|
|
58
|
+
|
|
59
|
+
# Tune pdfplumber's tolerances (defaults match the sidecar):
|
|
60
|
+
lens-qda extract paper.pdf --x-tolerance 3 --y-tolerance 3
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The `--json` schema matches the contract the LENS Tauri sidecar already
|
|
64
|
+
implements:
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{ "success": true, "text": "...all pages, joined by blank lines..." }
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
On failure:
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{ "success": false, "error": "<exception message>" }
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
(the process exits with status 1 in that case).
|
|
77
|
+
|
|
78
|
+
## Programmatic usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from pathlib import Path
|
|
82
|
+
import json, subprocess
|
|
83
|
+
|
|
84
|
+
result = subprocess.run(
|
|
85
|
+
["lens-qda", "extract", "paper.pdf", "--json"],
|
|
86
|
+
capture_output=True, text=True,  # no check=True: a failed extraction exits 1 but still prints the envelope
|
|
87
|
+
)
|
|
88
|
+
envelope = json.loads(result.stdout)
|
|
89
|
+
assert envelope["success"], envelope["error"]
|
|
90
|
+
corpus = envelope["text"]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT — same as the parent [LENS](https://github.com/mabo-du/lens) project.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
lens_qda/__init__.py
|
|
5
|
+
lens_qda/cli.py
|
|
6
|
+
lens_qda.egg-info/PKG-INFO
|
|
7
|
+
lens_qda.egg-info/SOURCES.txt
|
|
8
|
+
lens_qda.egg-info/dependency_links.txt
|
|
9
|
+
lens_qda.egg-info/entry_points.txt
|
|
10
|
+
lens_qda.egg-info/requires.txt
|
|
11
|
+
lens_qda.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdfplumber==0.11.4
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
lens_qda
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[build-system]
# The `[project]` table below uses PEP 639 metadata: `license` as an SPDX
# expression string plus `license-files = [...]`. setuptools accepts that
# combination only from 77.0.0 onward, so the floor must be at least that
# (77.0.3 is the floor the PyPA packaging guide recommends).
# NOTE(review): setuptools 77+ itself needs Python >= 3.9, so building from
# the sdist on Python 3.8 presumably will not work — confirm that 3.8 users
# are expected to install from the wheel.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"
|
|
7
|
+
|
|
8
|
+
[project]
|
|
9
|
+
name = "lens-qda"
|
|
10
|
+
version = "0.2.0" # dynamically rewritten in .github/workflows/release.yml from $GITHUB_REF_NAME
|
|
11
|
+
description = "Python utilities for LENS, a local-first qualitative data analysis (QDA) tool."
|
|
12
|
+
# `file = …` reads the FILE CONTENTS as the project long description; `text = …`
|
|
13
|
+
# would set the literal 9-character string "README.md" (a footgun). PyPI will
|
|
14
|
+
# render this as the package homepage body, so keep it pointing at the file.
|
|
15
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
16
|
+
requires-python = ">=3.8"
|
|
17
|
+
# SPDX expression form (PEP 639): ``license`` is a plain string such as "MIT".
# This is the newer form and needs a build backend with PEP 639 support
# (setuptools>=77); the older table form ``license = {file = "LICENSE"}`` is
# deprecated by that PEP. The full text lives in ``python/LICENSE`` (mirror of
# the project-root ``/LICENSE``; see the sync note in that file) and is listed
# below via ``license-files`` so PyPI serves the full MIT body alongside the
# SPDX badge.
|
|
24
|
+
license = "MIT"
|
|
25
|
+
license-files = ["LICENSE"]
|
|
26
|
+
|
|
27
|
+
keywords = ["qda", "qualitative", "research", "pdf", "text-mining"]
|
|
28
|
+
authors = [{ name = "LENS contributors" }]
|
|
29
|
+
classifiers = [
|
|
30
|
+
"Development Status :: 4 - Beta",
|
|
31
|
+
"Intended Audience :: Science/Research",
|
|
32
|
+
# No ``License :: ...`` classifier: PEP 639 forbids it when ``license``
|
|
33
|
+
# is an SPDX expression (which it is — see above).
|
|
34
|
+
"Programming Language :: Python :: 3",
|
|
35
|
+
"Programming Language :: Python :: 3.8",
|
|
36
|
+
"Programming Language :: Python :: 3.9",
|
|
37
|
+
"Programming Language :: Python :: 3.10",
|
|
38
|
+
"Programming Language :: Python :: 3.11",
|
|
39
|
+
"Programming Language :: Python :: 3.12",
|
|
40
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
41
|
+
"Topic :: Text Processing :: Markup",
|
|
42
|
+
]
|
|
43
|
+
dependencies = [
|
|
44
|
+
# Mirrors the canonical pdfplumber pin in
|
|
45
|
+
# src-tauri/sidecars/pdfplumber/requirements.txt (read by build.rs via the
|
|
46
|
+
# PDFPLUMBER_VERSION env var and by scripts/build-sidecar.sh). Bump BOTH
|
|
47
|
+
# consumers in lockstep so the bundled Tauri sidecar and this PyPI package
|
|
48
|
+
# always speak the same pdfplumber dialect.
|
|
49
|
+
"pdfplumber==0.11.4",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[project.urls]
|
|
53
|
+
"Homepage" = "https://github.com/mabo-du/lens"
|
|
54
|
+
"Source" = "https://github.com/mabo-du/lens/tree/main/python"
|
|
55
|
+
"Issues" = "https://github.com/mabo-du/lens/issues"
|
|
56
|
+
"Changelog" = "https://github.com/mabo-du/lens/blob/main/CHANGELOG.md"
|
|
57
|
+
|
|
58
|
+
[project.scripts]
|
|
59
|
+
lens-qda = "lens_qda.cli:main"
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
where = ["."]
|
|
63
|
+
include = ["lens_qda*"]
|
|
64
|
+
exclude = ["tests*"]
|
lens_qda-0.2.0/setup.cfg
ADDED