dstpr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstpr-0.1.0/LICENSE +21 -0
- dstpr-0.1.0/PKG-INFO +134 -0
- dstpr-0.1.0/README.md +94 -0
- dstpr-0.1.0/dstpr/__init__.py +13 -0
- dstpr-0.1.0/dstpr/pipeline.py +86 -0
- dstpr-0.1.0/dstpr/profiles.py +64 -0
- dstpr-0.1.0/dstpr.egg-info/PKG-INFO +134 -0
- dstpr-0.1.0/dstpr.egg-info/SOURCES.txt +12 -0
- dstpr-0.1.0/dstpr.egg-info/dependency_links.txt +1 -0
- dstpr-0.1.0/dstpr.egg-info/entry_points.txt +2 -0
- dstpr-0.1.0/dstpr.egg-info/requires.txt +5 -0
- dstpr-0.1.0/dstpr.egg-info/top_level.txt +1 -0
- dstpr-0.1.0/pyproject.toml +36 -0
- dstpr-0.1.0/setup.cfg +4 -0
dstpr-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Stephen Meisenbacher
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dstpr-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dstpr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A high-throughput, domain-specific text preprocessing cascading pipeline filter to rank core sentences in texts and filter away boilerplate.
|
|
5
|
+
Author-email: Stephen Meisenbacher <sjmeis@gtgd.com>
|
|
6
|
+
Maintainer-email: Stephen Meisenbacher <sjmeis@gtgd.com>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Stephen Meisenbacher
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
Classifier: Programming Language :: Python :: 3
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Operating System :: OS Independent
|
|
31
|
+
Requires-Python: >=3.10
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
License-File: LICENSE
|
|
34
|
+
Requires-Dist: sentence-transformers>=3.0.0
|
|
35
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
36
|
+
Requires-Dist: numpy>=1.22.0
|
|
37
|
+
Requires-Dist: datasets>=2.12.0
|
|
38
|
+
Requires-Dist: spacy<4.0.0,>=3.8.0
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# DSTPR: Domain-specific Text Pre-processing and Ranking
|
|
42
|
+
|
|
43
|
+
[](https://pypi.org/project/dstpr/)
|
|
44
|
+
[](https://github.com/sjmeis/DSTPR/blob/main/LICENSE)
|
|
45
|
+
|
|
46
|
+
A high-throughput text preprocessing pipeline designed to filter, segment, and rank core sentences from noisy, plaintext documents.
|
|
47
|
+
|
|
48
|
+
This package strips away boilerplate, disclaimers, and application instructions *before* moving this data on to heavier processing pipelines.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Pipeline Architecture
|
|
53
|
+
|
|
54
|
+
Instead of running heavy transformer models over every sentence in a document, `DSTPR` pipes documents through progressively stricter layers:
|
|
55
|
+
|
|
56
|
+
1. **Heuristic Cleansing & Tokenization:** Uses `PySBD` (Python Sentence Boundary Disambiguation) paired with regular expressions to fix punctuation caused by flattening, then filters out sentences containing stop phrases.
|
|
57
|
+
2. **Semantic Section Routing:** Utilizes a lightweight encoder (Default: `all-MiniLM-L6-v2`) to dynamically find structural transitions in input texts.
|
|
58
|
+
3. **Hybrid Ranking:** Scores remaining sentences using a combination of **Semantics** (cosine similarity to anchors) and **Lexical Syntax** (regex-supported detection).
|
|
59
|
+
4. **Parallel Execution Engine:** Wraps the entire pipeline inside efficient datasets to allow for batch processing at scale, with CPU or GPU.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
To install the package, simply run
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install dstpr
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Usage
|
|
72
|
+
|
|
73
|
+
### High-Throughput Batch Processing
|
|
74
|
+
|
|
75
|
+
This is the recommended approach for large-scale data pipelines.
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from dstpr import ParallelPreprocessingPipeline, JOB_POSTING_PROFILE
|
|
79
|
+
|
|
80
|
+
pipeline = ParallelPreprocessingPipeline(profile=JOB_POSTING_PROFILE, batch_size=64)
|
|
81
|
+
|
|
82
|
+
raw_documents = [
|
|
83
|
+
"DOC 1",
|
|
84
|
+
"DOC 2",
|
|
85
|
+
"..."
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
cleaned_documents = pipeline.process(raw_documents, num_workers=8, threshold=0.25)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Advanced Usage
|
|
92
|
+
|
|
93
|
+
If you want to integrate specific pipeline layers directly into an existing workflow, or tweak the internal parameters, you can import individual modules manually:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from dstpr.cleaners import clean_and_split_chunks
|
|
97
|
+
from dstpr.segmenters import SemanticSectionRouter
|
|
98
|
+
from dstpr.rankers import HybridRanker
|
|
99
|
+
from sentence_transformers import SentenceTransformer
|
|
100
|
+
|
|
101
|
+
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
102
|
+
|
|
103
|
+
text = "Job posting text goes here!"
|
|
104
|
+
|
|
105
|
+
# Clean and segment sentences
|
|
106
|
+
sentences = clean_and_split_chunks(text)
|
|
107
|
+
|
|
108
|
+
# Route by context
|
|
109
|
+
router = SemanticSectionRouter(model=embedding_model)
|
|
110
|
+
buckets = router.route_sentences(sentences)
|
|
111
|
+
target_sentences = buckets['CORE'] + buckets['REQUIREMENTS']
|
|
112
|
+
|
|
113
|
+
# Grade and sort text features
|
|
114
|
+
ranker = HybridRanker(model=embedding_model)
|
|
115
|
+
final = ranker.rank_and_filter(target_sentences, bi_cutoff_pct=0.25, final_threshold=0.40)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Parameter Tuning
|
|
119
|
+
|
|
120
|
+
To adjust the trade-off between strict filtering and execution speed, consider tweaking these variables in `ParallelPreprocessingPipeline`:
|
|
121
|
+
- `threshold` (Default: `0.5`): Controls how aggressively sentences are discarded. Raising this value toward `0.6`, for example, ensures only stronger matching sentences are passed through. Lowering it toward `0.3`, on the other hand, acts as a wider net.
|
|
122
|
+
- `batch_size` (Default: `256`): This depends on your hardware! Adjust for best performance.
|
|
123
|
+
|
|
124
|
+
## Creating Custom Domain Profiles (via the provided wizard)
|
|
125
|
+
|
|
126
|
+
A `DomainProfile` is required to use `DSTPR` (see the usage example). Out of the box, `dstpr` ships with a pre-configured profile for job postings (`JOB_POSTING_PROFILE`). However, you can easily generate a domain-specific pipeline for **any specific domain** using our built-in interactive configuration wizard.
|
|
127
|
+
|
|
128
|
+
To spin up the profile creation walkthrough, simply open your terminal and run:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
profile-wizard
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
All profiles generated via the terminal wizard are automatically validated and written out as JSON to a local cache: `~/.config/dstpr/profiles/`
|
dstpr-0.1.0/README.md
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# DSTPR: Domain-specific Text Pre-processing and Ranking
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/dstpr/)
|
|
4
|
+
[](https://github.com/sjmeis/DSTPR/blob/main/LICENSE)
|
|
5
|
+
|
|
6
|
+
A high-throughput text preprocessing pipeline designed to filter, segment, and rank core sentences from noisy, plaintext documents.
|
|
7
|
+
|
|
8
|
+
This package strips away boilerplate, disclaimers, and application instructions *before* moving this data on to heavier processing pipelines.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Pipeline Architecture
|
|
13
|
+
|
|
14
|
+
Instead of running heavy transformer models over every sentence in a document, `DSTPR` pipes documents through progressively stricter layers:
|
|
15
|
+
|
|
16
|
+
1. **Heuristic Cleansing & Tokenization:** Uses `PySBD` (Python Sentence Boundary Disambiguation) paired with regular expressions to fix punctuation caused by flattening, then filters out sentences containing stop phrases.
|
|
17
|
+
2. **Semantic Section Routing:** Utilizes a lightweight encoder (Default: `all-MiniLM-L6-v2`) to dynamically find structural transitions in input texts.
|
|
18
|
+
3. **Hybrid Ranking:** Scores remaining sentences using a combination of **Semantics** (cosine similarity to anchors) and **Lexical Syntax** (regex-supported detection).
|
|
19
|
+
4. **Parallel Execution Engine:** Wraps the entire pipeline inside efficient datasets to allow for batch processing at scale, with CPU or GPU.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
To install the package, simply run
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install dstpr
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### High-Throughput Batch Processing
|
|
34
|
+
|
|
35
|
+
This is the recommended approach for large-scale data pipelines.
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from dstpr import ParallelPreprocessingPipeline, JOB_POSTING_PROFILE
|
|
39
|
+
|
|
40
|
+
pipeline = ParallelPreprocessingPipeline(profile=JOB_POSTING_PROFILE, batch_size=64)
|
|
41
|
+
|
|
42
|
+
raw_documents = [
|
|
43
|
+
"DOC 1",
|
|
44
|
+
"DOC 2",
|
|
45
|
+
"..."
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
cleaned_documents = pipeline.process(raw_documents, num_workers=8, threshold=0.25)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Advanced Usage
|
|
52
|
+
|
|
53
|
+
If you want to integrate specific pipeline layers directly into an existing workflow, or tweak the internal parameters, you can import individual modules manually:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from dstpr.cleaners import clean_and_split_chunks
|
|
57
|
+
from dstpr.segmenters import SemanticSectionRouter
|
|
58
|
+
from dstpr.rankers import HybridRanker
|
|
59
|
+
from sentence_transformers import SentenceTransformer
|
|
60
|
+
|
|
61
|
+
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
62
|
+
|
|
63
|
+
text = "Job posting text goes here!"
|
|
64
|
+
|
|
65
|
+
# Clean and segment sentences
|
|
66
|
+
sentences = clean_and_split_chunks(text)
|
|
67
|
+
|
|
68
|
+
# Route by context
|
|
69
|
+
router = SemanticSectionRouter(model=embedding_model)
|
|
70
|
+
buckets = router.route_sentences(sentences)
|
|
71
|
+
target_sentences = buckets['CORE'] + buckets['REQUIREMENTS']
|
|
72
|
+
|
|
73
|
+
# Grade and sort text features
|
|
74
|
+
ranker = HybridRanker(model=embedding_model)
|
|
75
|
+
final = ranker.rank_and_filter(target_sentences, bi_cutoff_pct=0.25, final_threshold=0.40)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Parameter Tuning
|
|
79
|
+
|
|
80
|
+
To adjust the trade-off between strict filtering and execution speed, consider tweaking these variables in `ParallelPreprocessingPipeline`:
|
|
81
|
+
- `threshold` (Default: `0.5`): Controls how aggressively sentences are discarded. Raising this value toward `0.6`, for example, ensures only stronger matching sentences are passed through. Lowering it toward `0.3`, on the other hand, acts as a wider net.
|
|
82
|
+
- `batch_size` (Default: `256`): This depends on your hardware! Adjust for best performance.
|
|
83
|
+
|
|
84
|
+
## Creating Custom Domain Profiles (via the provided wizard)
|
|
85
|
+
|
|
86
|
+
A `DomainProfile` is required to use `DSTPR` (see the usage example). Out of the box, `dstpr` ships with a pre-configured profile for job postings (`JOB_POSTING_PROFILE`). However, you can easily generate a domain-specific pipeline for **any specific domain** using our built-in interactive configuration wizard.
|
|
87
|
+
|
|
88
|
+
To spin up the profile creation walkthrough, simply open your terminal and run:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
profile-wizard
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
All profiles generated via the terminal wizard are automatically validated and written out as JSON to a local cache: `~/.config/dstpr/profiles/`
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
try:
|
|
3
|
+
import lzma
|
|
4
|
+
except ImportError:
|
|
5
|
+
from backports import lzma
|
|
6
|
+
sys.modules['lzma'] = lzma
|
|
7
|
+
|
|
8
|
+
from dstpr.pipeline import ParallelPreprocessingPipeline
|
|
9
|
+
from dstpr.profiles import DomainProfile, JOB_POSTING_PROFILE
|
|
10
|
+
from dstpr.utils.cli import run_profile_wizard, DEFAULT_STORAGE_DIR
|
|
11
|
+
from dstpr.utils.setup import setup_resources
|
|
12
|
+
|
|
13
|
+
__all__ = ["ParallelPreprocessingPipeline", "DomainProfile", "JOB_POSTING_PROFILE", "run_profile_wizard", "DEFAULT_STORAGE_DIR", "setup_resources"]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import torch
|
|
3
|
+
from datasets import Dataset
|
|
4
|
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
|
5
|
+
from dstpr.cleaners.pre_process import clean_and_split_chunks
|
|
6
|
+
from dstpr.cleaners.post_process import sanitize_outputs
|
|
7
|
+
from dstpr.segmenters.section_split import SemanticSectionRouter
|
|
8
|
+
from dstpr.rankers.hybrid import HybridRanker
|
|
9
|
+
from dstpr.profiles import DomainProfile
|
|
10
|
+
|
|
11
|
+
class ParallelPreprocessingPipeline:
    """Clean, route and rank raw documents in parallel batches.

    Documents are wrapped in a ``datasets.Dataset`` and mapped through
    ``_process_batch``, which chains sentence splitting, section routing,
    hybrid ranking and output sanitising for every document.

    Args:
        profile: Domain configuration handed to the router and the ranker.
        batch_size: Number of documents passed to each ``_process_batch`` call.
        embedding_model: Name of the SentenceTransformer bi-encoder to load.
    """

    def __init__(self, profile: DomainProfile, batch_size: int = 256, embedding_model: str = "all-MiniLM-L6-v2"):
        self.profile = profile
        self.batch_size = batch_size
        self.embedding_model = embedding_model

        # Models are created on first use (see _lazy_init_local_resources),
        # not here, so constructing a pipeline stays cheap.
        self.router = None
        self.ranker = None
        self._is_initialized = False

    def _lazy_init_local_resources(self):
        """Load the encoders and build the router/ranker on first call only."""
        if self._is_initialized:
            return

        # NOTE(review): presumably limits intra-op threads so that several
        # worker processes do not oversubscribe the CPU -- confirm.
        torch.set_num_threads(1)

        bi_encoder = SentenceTransformer(self.embedding_model)
        cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

        # The same bi-encoder instance is shared by router and ranker.
        self.router = SemanticSectionRouter(profile=self.profile, model=bi_encoder)
        self.ranker = HybridRanker(
            profile=self.profile,
            model=bi_encoder,
            cross_encoder_model=cross_encoder,
        )
        self._is_initialized = True

    def _process_batch(self, batch: dict, threshold: float) -> dict:
        """Run the full cascade over one batch of documents.

        Args:
            batch: Mapping with a ``"raw_text"`` entry holding the documents.
            threshold: Forwarded to the ranker as ``final_threshold``.

        Returns:
            ``{"cleaned_text": [...]}`` with one entry per input document;
            empty or non-string inputs yield ``""``.
        """
        self._lazy_init_local_resources()

        cleaned_documents = []
        for raw_text in batch["raw_text"]:
            if not raw_text or not isinstance(raw_text, str):
                cleaned_documents.append("")
                continue

            sentences = clean_and_split_chunks(raw_text)
            buckets = self.router.route_sentences(sentences)

            target_sentences = []
            for section_key in self.router.profile.target_sections:
                target_sentences.extend(buckets.get(section_key, []))

            # Nothing landed in a target section: fall back to every routed
            # sentence rather than handing the ranker an empty list.
            if not target_sentences:
                for sec_content in buckets.values():
                    target_sentences.extend(sec_content)

            filtered_tasks = self.ranker.rank_and_filter(target_sentences, final_threshold=threshold)
            cleaned_documents.append(sanitize_outputs(filtered_tasks))

        return {"cleaned_text": cleaned_documents}

    def process(self, texts: list[str], num_workers: int | None = None, threshold: float = 0.50) -> list[str]:
        """Filter ``texts`` in parallel and return the cleaned documents.

        Args:
            texts: Raw documents to process.
            num_workers: Number of worker processes passed to ``Dataset.map``
                as ``num_proc``. Defaults to one less than the CPU count,
                and never below one.
            threshold: Ranking threshold forwarded to every batch.

        Returns:
            The cleaned documents as a list, in the order produced by
            ``Dataset.map``.
        """
        # No documents means no output column would be created; answer directly.
        if len(texts) == 0:
            return []

        if num_workers is None:
            # os.cpu_count() may return None when the count is undeterminable.
            num_workers = max(1, (os.cpu_count() or 1) - 1)

        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        print(f"Creating Dataset for {len(texts)} documents...")
        dataset = Dataset.from_dict({"raw_text": texts})

        print(f"Running pipeline using {num_workers} parallel worker(s)...")

        processed_dataset = dataset.map(
            self._process_batch,
            batched=True,
            batch_size=self.batch_size,
            num_proc=num_workers,
            fn_kwargs={"threshold": threshold},
            desc="Filtering Documents",
        )

        # Materialise as a plain list so the annotated return type holds
        # whatever column object the installed `datasets` version returns.
        return list(processed_dataset["cleaned_text"])
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from dataclasses import dataclass, field, asdict
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
@dataclass
|
|
6
|
+
class DomainProfile:
|
|
7
|
+
name: str
|
|
8
|
+
global_stop_phrases: set[str] = field(default_factory=set)
|
|
9
|
+
anchors: dict[str, str] = field(default_factory=dict)
|
|
10
|
+
default_section: str = "CORE"
|
|
11
|
+
target_sections: list[str] = field(default_factory=list)
|
|
12
|
+
target_semantic_anchor: str = "core actions and focus areas"
|
|
13
|
+
action_keywords_regex: str = r"\b(do|execute|perform)\b"
|
|
14
|
+
syntax_start_pattern: str = r"^.*"
|
|
15
|
+
|
|
16
|
+
def validate(self):
|
|
17
|
+
if not self.name.isalnum():
|
|
18
|
+
raise ValueError("Profile name must be alphanumeric (no spaces or special characters).")
|
|
19
|
+
if not self.section_anchors:
|
|
20
|
+
raise ValueError("DomainProfile must define at least one section anchor.")
|
|
21
|
+
if self.default_section not in self.anchors:
|
|
22
|
+
raise ValueError(f"Default section '{self.default_section}' must be one of the defined section anchors.")
|
|
23
|
+
for section in self.target_sections:
|
|
24
|
+
if section not in self.anchors:
|
|
25
|
+
raise ValueError(f"Target section '{section}' must match a key in section_anchors.")
|
|
26
|
+
|
|
27
|
+
def save_to_json(self, target_dir: Path | str) -> Path:
|
|
28
|
+
self.validate()
|
|
29
|
+
dir_path = Path(target_dir)
|
|
30
|
+
dir_path.mkdir(parents=True, exist_ok=True)
|
|
31
|
+
|
|
32
|
+
file_path = dir_path / f"{self.name}.json"
|
|
33
|
+
|
|
34
|
+
data = asdict(self)
|
|
35
|
+
data['global_stop_phrases'] = list(data['global_stop_phrases'])
|
|
36
|
+
|
|
37
|
+
with open(file_path, 'w', encoding='utf-8') as f:
|
|
38
|
+
json.dump(data, f, indent=4)
|
|
39
|
+
|
|
40
|
+
return file_path
|
|
41
|
+
|
|
42
|
+
@classmethod
|
|
43
|
+
def load_from_json(cls, file_path: Path | str) -> 'DomainProfile':
|
|
44
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
45
|
+
data = json.load(f)
|
|
46
|
+
|
|
47
|
+
data['global_stop_phrases'] = set(data['global_stop_phrases'])
|
|
48
|
+
return cls(**data)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Built-in profile for job postings. CORE and REQUIREMENTS are the target
# sections; BOILERPLATE is defined as an anchor but is not targeted.
JOB_POSTING_PROFILE = DomainProfile(
    # Must be alphanumeric: DomainProfile.validate() rejects names containing
    # spaces, so the former "job postings" could never be saved to JSON.
    name="jobpostings",
    global_stop_phrases={"apply online", "click here", "equal opportunity employer", "visa sponsorship"},
    anchors={
        'CORE': "core job duties, daily responsibilities, tasks, project assignments, what you will build and execute on the job",
        'REQUIREMENTS': "candidate qualifications, required experience, required skills, degrees, interview assessment process, hiring criteria",
        'BOILERPLATE': "company overview, history, mission statement, office locations, perks and benefits, scam alerts, recruitment partners, privacy policies"
    },
    default_section="CORE",
    target_sections=['CORE', 'REQUIREMENTS'],
    target_semantic_anchor="execute responsibilities, perform duties, manage projects, build and design systems",
    action_keywords_regex=r"\b(write|design|build|develop|manage|lead|collaborate|implement|maintain)\b",
    syntax_start_pattern=r"^(?:you\s+will\s+|responsibilities\s+include\s+)?",
)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dstpr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A high-throughput, domain-specific text preprocessing cascading pipeline filter to rank core sentences in texts and filter away boilerplate.
|
|
5
|
+
Author-email: Stephen Meisenbacher <sjmeis@gtgd.com>
|
|
6
|
+
Maintainer-email: Stephen Meisenbacher <sjmeis@gtgd.com>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Stephen Meisenbacher
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
Classifier: Programming Language :: Python :: 3
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Operating System :: OS Independent
|
|
31
|
+
Requires-Python: >=3.10
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
License-File: LICENSE
|
|
34
|
+
Requires-Dist: sentence-transformers>=3.0.0
|
|
35
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
36
|
+
Requires-Dist: numpy>=1.22.0
|
|
37
|
+
Requires-Dist: datasets>=2.12.0
|
|
38
|
+
Requires-Dist: spacy<4.0.0,>=3.8.0
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# DSTPR: Domain-specific Text Pre-processing and Ranking
|
|
42
|
+
|
|
43
|
+
[](https://pypi.org/project/dstpr/)
|
|
44
|
+
[](https://github.com/sjmeis/DSTPR/blob/main/LICENSE)
|
|
45
|
+
|
|
46
|
+
A high-throughput text preprocessing pipeline designed to filter, segment, and rank core sentences from noisy, plaintext documents.
|
|
47
|
+
|
|
48
|
+
This package strips away boilerplate, disclaimers, and application instructions *before* moving this data on to heavier processing pipelines.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Pipeline Architecture
|
|
53
|
+
|
|
54
|
+
Instead of running heavy transformer models over every sentence in a document, `DSTPR` pipes documents through progressively stricter layers:
|
|
55
|
+
|
|
56
|
+
1. **Heuristic Cleansing & Tokenization:** Uses `PySBD` (Python Sentence Boundary Disambiguation) paired with regular expressions to fix punctuation caused by flattening, then filters out sentences containing stop phrases.
|
|
57
|
+
2. **Semantic Section Routing:** Utilizes a lightweight encoder (Default: `all-MiniLM-L6-v2`) to dynamically find structural transitions in input texts.
|
|
58
|
+
3. **Hybrid Ranking:** Scores remaining sentences using a combination of **Semantics** (cosine similarity to anchors) and **Lexical Syntax** (regex-supported detection).
|
|
59
|
+
4. **Parallel Execution Engine:** Wraps the entire pipeline inside efficient datasets to allow for batch processing at scale, with CPU or GPU.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
To install the package, simply run
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install dstpr
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Usage
|
|
72
|
+
|
|
73
|
+
### High-Throughput Batch Processing
|
|
74
|
+
|
|
75
|
+
This is the recommended approach for large-scale data pipelines.
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from dstpr import ParallelPreprocessingPipeline, JOB_POSTING_PROFILE
|
|
79
|
+
|
|
80
|
+
pipeline = ParallelPreprocessingPipeline(profile=JOB_POSTING_PROFILE, batch_size=64)
|
|
81
|
+
|
|
82
|
+
raw_documents = [
|
|
83
|
+
"DOC 1",
|
|
84
|
+
"DOC 2",
|
|
85
|
+
"..."
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
cleaned_documents = pipeline.process(raw_documents, num_workers=8, threshold=0.25)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Advanced Usage
|
|
92
|
+
|
|
93
|
+
If you want to integrate specific pipeline layers directly into an existing workflow, or tweak the internal parameters, you can import individual modules manually:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from dstpr.cleaners import clean_and_split_chunks
|
|
97
|
+
from dstpr.segmenters import SemanticSectionRouter
|
|
98
|
+
from dstpr.rankers import HybridRanker
|
|
99
|
+
from sentence_transformers import SentenceTransformer
|
|
100
|
+
|
|
101
|
+
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
102
|
+
|
|
103
|
+
text = "Job posting text goes here!"
|
|
104
|
+
|
|
105
|
+
# Clean and segment sentences
|
|
106
|
+
sentences = clean_and_split_chunks(text)
|
|
107
|
+
|
|
108
|
+
# Route by context
|
|
109
|
+
router = SemanticSectionRouter(model=embedding_model)
|
|
110
|
+
buckets = router.route_sentences(sentences)
|
|
111
|
+
target_sentences = buckets['CORE'] + buckets['REQUIREMENTS']
|
|
112
|
+
|
|
113
|
+
# Grade and sort text features
|
|
114
|
+
ranker = HybridRanker(model=embedding_model)
|
|
115
|
+
final = ranker.rank_and_filter(target_sentences, bi_cutoff_pct=0.25, final_threshold=0.40)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Parameter Tuning
|
|
119
|
+
|
|
120
|
+
To adjust the trade-off between strict filtering and execution speed, consider tweaking these variables in `ParallelPreprocessingPipeline`:
|
|
121
|
+
- `threshold` (Default: `0.5`): Controls how aggressively sentences are discarded. Raising this value toward `0.6`, for example, ensures only stronger matching sentences are passed through. Lowering it toward `0.3`, on the other hand, acts as a wider net.
|
|
122
|
+
- `batch_size` (Default: `256`): This depends on your hardware! Adjust for best performance.
|
|
123
|
+
|
|
124
|
+
## Creating Custom Domain Profiles (via the provided wizard)
|
|
125
|
+
|
|
126
|
+
A `DomainProfile` is required to use `DSTPR` (see the usage example). Out of the box, `dstpr` ships with a pre-configured profile for job postings (`JOB_POSTING_PROFILE`). However, you can easily generate a domain-specific pipeline for **any specific domain** using our built-in interactive configuration wizard.
|
|
127
|
+
|
|
128
|
+
To spin up the profile creation walkthrough, simply open your terminal and run:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
profile-wizard
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
All profiles generated via the terminal wizard are automatically validated and written out as JSON to a local cache: `~/.config/dstpr/profiles/`
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
dstpr/__init__.py
|
|
5
|
+
dstpr/pipeline.py
|
|
6
|
+
dstpr/profiles.py
|
|
7
|
+
dstpr.egg-info/PKG-INFO
|
|
8
|
+
dstpr.egg-info/SOURCES.txt
|
|
9
|
+
dstpr.egg-info/dependency_links.txt
|
|
10
|
+
dstpr.egg-info/entry_points.txt
|
|
11
|
+
dstpr.egg-info/requires.txt
|
|
12
|
+
dstpr.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dstpr
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dstpr"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A high-throughput, domain-specific text preprocessing cascading pipeline filter to rank core sentences in texts and filter away boilerplate."
|
|
9
|
+
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Stephen Meisenbacher", email = "sjmeis@gtgd.com"},
|
|
13
|
+
]
|
|
14
|
+
maintainers = [
|
|
15
|
+
{name = "Stephen Meisenbacher", email = "sjmeis@gtgd.com"}
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.10"
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent"
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"sentence-transformers>=3.0.0",
|
|
25
|
+
"scikit-learn>=1.0.0",
|
|
26
|
+
"numpy>=1.22.0",
|
|
27
|
+
"datasets>=2.12.0",
|
|
28
|
+
"spacy>=3.8.0,<4.0.0"
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.packages.find]
|
|
32
|
+
where = ["."]
|
|
33
|
+
include = ["dstpr", "dstpr.*"]
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
profile-wizard = "dstpr.utils.cli:run_profile_wizard"
|
dstpr-0.1.0/setup.cfg
ADDED