dstpr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dstpr-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stephen Meisenbacher
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
dstpr-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: dstpr
3
+ Version: 0.1.0
4
+ Summary: A high-throughput, domain-specific text preprocessing cascading pipeline filter to rank core sentences in texts and filter away boilerplate.
5
+ Author-email: Stephen Meisenbacher <sjmeis@gtgd.com>
6
+ Maintainer-email: Stephen Meisenbacher <sjmeis@gtgd.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 Stephen Meisenbacher
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ Classifier: Programming Language :: Python :: 3
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Operating System :: OS Independent
31
+ Requires-Python: >=3.10
32
+ Description-Content-Type: text/markdown
33
+ License-File: LICENSE
34
+ Requires-Dist: sentence-transformers>=3.0.0
35
+ Requires-Dist: scikit-learn>=1.0.0
36
+ Requires-Dist: numpy>=1.22.0
37
+ Requires-Dist: datasets>=2.12.0
38
+ Requires-Dist: spacy<4.0.0,>=3.8.0
39
+ Dynamic: license-file
40
+
41
+ # DSTPR: Domain-specific Text Pre-processing and Ranking
42
+
43
+ [![PyPI version](https://img.shields.io/pypi/v/dstpr.svg)](https://pypi.org/project/dstpr/)
44
+ [![License](https://img.shields.io/github/license/sjmeis/DSTPR.svg)](https://github.com/sjmeis/DSTPR/blob/main/LICENSE)
45
+
46
+ A high-throughput text preprocessing pipeline designed to filter, segment, and rank core sentences from noisy, plaintext documents.
47
+
48
+ This package strips away boilerplate, disclaimers, and application instructions *before* the data is passed on to heavier processing pipelines.
49
+
50
+ ---
51
+
52
+ ## Pipeline Architecture
53
+
54
+ Instead of running heavy transformer models over every sentence in a document, `DSTPR` pipes documents through progressively stricter layers:
55
+
56
+ 1. **Heuristic Cleansing & Tokenization:** Uses `PySBD` (Python Sentence Boundary Disambiguation) paired with regular expressions to fix punctuation caused by flattening, then filters out sentences containing stop phrases.
57
+ 2. **Semantic Section Routing:** Utilizes a lightweight encoder (Default: `all-MiniLM-L6-v2`) to dynamically find structural transitions in input texts.
58
+ 3. **Hybrid Ranking:** Scores remaining sentences using a combination of **Semantics** (cosine similarity to anchors) and **Lexical Syntax** (regex-supported detection).
59
+ 4. **Parallel Execution Engine:** Wraps the entire pipeline inside efficient datasets to allow for batch processing at scale, with CPU or GPU.
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ To install the package, simply run
66
+
67
+ ```bash
68
+ pip install dstpr
69
+ ```
70
+
71
+ ## Usage
72
+
73
+ ### High-Throughput Batch Processing
74
+
75
+ This is the recommended approach for large-scale data pipelines.
76
+
77
+ ```python
78
+ from dstpr import ParallelPreprocessingPipeline, JOB_POSTING_PROFILE
79
+
80
+ pipeline = ParallelPreprocessingPipeline(profile=JOB_POSTING_PROFILE, batch_size=64)
81
+
82
+ raw_documents = [
83
+ "DOC 1",
84
+ "DOC 2",
85
+ "..."
86
+ ]
87
+
88
+ cleaned_documents = pipeline.process(raw_documents, num_workers=8, threshold=0.25)
89
+ ```
90
+
91
+ ### Advanced Usage
92
+
93
+ If you want to integrate specific pipeline layers directly into an existing workflow, or tweak the internal parameters, you can import individual modules manually:
94
+
95
+ ```python
96
+ from dstpr.cleaners import clean_and_split_chunks
97
+ from dstpr.segmenters import SemanticSectionRouter
98
+ from dstpr.rankers import HybridRanker
99
+ from sentence_transformers import SentenceTransformer
100
+
101
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
102
+
103
+ text = "Job posting text goes here!"
104
+
105
+ # Clean and segment sentences
106
+ sentences = clean_and_split_chunks(text)
107
+
108
+ # Route by context
109
+ router = SemanticSectionRouter(model=embedding_model)
110
+ buckets = router.route_sentences(sentences)
111
+ target_sentences = buckets['CORE'] + buckets['REQUIREMENTS']
112
+
113
+ # Grade and sort text features
114
+ ranker = HybridRanker(model=embedding_model)
115
+ final = ranker.rank_and_filter(target_sentences, bi_cutoff_pct=0.25, final_threshold=0.40)
116
+ ```
117
+
118
+ ### Parameter Tuning
119
+
120
+ To adjust the trade-off between strict filtering and execution speed, consider tweaking these variables in `ParallelPreprocessingPipeline`:
121
+ - `threshold` (Default: `0.5`; passed to `process()`): Controls how aggressively sentences are discarded. Raising this value toward `0.6`, for example, ensures only stronger matching sentences are passed through. Lowering it toward `0.3`, on the other hand, acts as a wider net.
122
+ - `batch_size` (Default: `256`): This depends on your hardware! Adjust for best performance.
123
+
124
+ ## Creating Custom Domain Profiles (via the provided wizard)
125
+
126
+ A `DomainProfile` is required to use `DSTPR` (see the usage example). Out of the box, `dstpr` ships with a pre-configured profile for job postings (`JOB_POSTING_PROFILE`). However, you can easily generate a domain-specific profile for **any domain** using our built-in interactive configuration wizard.
127
+
128
+ To spin up the profile creation walkthrough, simply open your terminal and run:
129
+
130
+ ```bash
131
+ profile-wizard
132
+ ```
133
+
134
+ All profiles generated via the terminal wizard are automatically validated and written out as JSON to a local cache: `~/.config/dstpr/profiles/`
dstpr-0.1.0/README.md ADDED
@@ -0,0 +1,94 @@
1
+ # DSTPR: Domain-specific Text Pre-processing and Ranking
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/dstpr.svg)](https://pypi.org/project/dstpr/)
4
+ [![License](https://img.shields.io/github/license/sjmeis/DSTPR.svg)](https://github.com/sjmeis/DSTPR/blob/main/LICENSE)
5
+
6
+ A high-throughput text preprocessing pipeline designed to filter, segment, and rank core sentences from noisy, plaintext documents.
7
+
8
+ This package strips away boilerplate, disclaimers, and application instructions *before* the data is passed on to heavier processing pipelines.
9
+
10
+ ---
11
+
12
+ ## Pipeline Architecture
13
+
14
+ Instead of running heavy transformer models over every sentence in a document, `DSTPR` pipes documents through progressively stricter layers:
15
+
16
+ 1. **Heuristic Cleansing & Tokenization:** Uses `PySBD` (Python Sentence Boundary Disambiguation) paired with regular expressions to fix punctuation caused by flattening, then filters out sentences containing stop phrases.
17
+ 2. **Semantic Section Routing:** Utilizes a lightweight encoder (Default: `all-MiniLM-L6-v2`) to dynamically find structural transitions in input texts.
18
+ 3. **Hybrid Ranking:** Scores remaining sentences using a combination of **Semantics** (cosine similarity to anchors) and **Lexical Syntax** (regex-supported detection).
19
+ 4. **Parallel Execution Engine:** Wraps the entire pipeline inside efficient datasets to allow for batch processing at scale, with CPU or GPU.
20
+
21
+ ---
22
+
23
+ ## Installation
24
+
25
+ To install the package, simply run
26
+
27
+ ```bash
28
+ pip install dstpr
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ### High-Throughput Batch Processing
34
+
35
+ This is the recommended approach for large-scale data pipelines.
36
+
37
+ ```python
38
+ from dstpr import ParallelPreprocessingPipeline, JOB_POSTING_PROFILE
39
+
40
+ pipeline = ParallelPreprocessingPipeline(profile=JOB_POSTING_PROFILE, batch_size=64)
41
+
42
+ raw_documents = [
43
+ "DOC 1",
44
+ "DOC 2",
45
+ "..."
46
+ ]
47
+
48
+ cleaned_documents = pipeline.process(raw_documents, num_workers=8, threshold=0.25)
49
+ ```
50
+
51
+ ### Advanced Usage
52
+
53
+ If you want to integrate specific pipeline layers directly into an existing workflow, or tweak the internal parameters, you can import individual modules manually:
54
+
55
+ ```python
56
+ from dstpr.cleaners import clean_and_split_chunks
57
+ from dstpr.segmenters import SemanticSectionRouter
58
+ from dstpr.rankers import HybridRanker
59
+ from sentence_transformers import SentenceTransformer
60
+
61
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
62
+
63
+ text = "Job posting text goes here!"
64
+
65
+ # Clean and segment sentences
66
+ sentences = clean_and_split_chunks(text)
67
+
68
+ # Route by context
69
+ router = SemanticSectionRouter(model=embedding_model)
70
+ buckets = router.route_sentences(sentences)
71
+ target_sentences = buckets['CORE'] + buckets['REQUIREMENTS']
72
+
73
+ # Grade and sort text features
74
+ ranker = HybridRanker(model=embedding_model)
75
+ final = ranker.rank_and_filter(target_sentences, bi_cutoff_pct=0.25, final_threshold=0.40)
76
+ ```
77
+
78
+ ### Parameter Tuning
79
+
80
+ To adjust the trade-off between strict filtering and execution speed, consider tweaking these variables in `ParallelPreprocessingPipeline`:
81
+ - `threshold` (Default: `0.5`; passed to `process()`): Controls how aggressively sentences are discarded. Raising this value toward `0.6`, for example, ensures only stronger matching sentences are passed through. Lowering it toward `0.3`, on the other hand, acts as a wider net.
82
+ - `batch_size` (Default: `256`): This depends on your hardware! Adjust for best performance.
83
+
84
+ ## Creating Custom Domain Profiles (via the provided wizard)
85
+
86
+ A `DomainProfile` is required to use `DSTPR` (see the usage example). Out of the box, `dstpr` ships with a pre-configured profile for job postings (`JOB_POSTING_PROFILE`). However, you can easily generate a domain-specific profile for **any domain** using our built-in interactive configuration wizard.
87
+
88
+ To spin up the profile creation walkthrough, simply open your terminal and run:
89
+
90
+ ```bash
91
+ profile-wizard
92
+ ```
93
+
94
+ All profiles generated via the terminal wizard are automatically validated and written out as JSON to a local cache: `~/.config/dstpr/profiles/`
@@ -0,0 +1,13 @@
1
+ import sys
2
+ try:
3
+ import lzma
4
+ except ImportError:
5
+ from backports import lzma
6
+ sys.modules['lzma'] = lzma
7
+
8
+ from dstpr.pipeline import ParallelPreprocessingPipeline
9
+ from dstpr.profiles import DomainProfile, JOB_POSTING_PROFILE
10
+ from dstpr.utils.cli import run_profile_wizard, DEFAULT_STORAGE_DIR
11
+ from dstpr.utils.setup import setup_resources
12
+
13
+ __all__ = ["ParallelPreprocessingPipeline", "DomainProfile", "JOB_POSTING_PROFILE", "run_profile_wizard", "DEFAULT_STORAGE_DIR", "setup_resources"]
@@ -0,0 +1,86 @@
1
+ import os
2
+ import torch
3
+ from datasets import Dataset
4
+ from sentence_transformers import SentenceTransformer, CrossEncoder
5
+ from dstpr.cleaners.pre_process import clean_and_split_chunks
6
+ from dstpr.cleaners.post_process import sanitize_outputs
7
+ from dstpr.segmenters.section_split import SemanticSectionRouter
8
+ from dstpr.rankers.hybrid import HybridRanker
9
+ from dstpr.profiles import DomainProfile
10
+
11
+ class ParallelPreprocessingPipeline:
12
+ def __init__(self, profile: DomainProfile, batch_size: int = 256, embedding_model: str = "all-MiniLM-L6-v2"):
13
+ self.profile = profile
14
+ self.batch_size = batch_size
15
+ self.embedding_model = embedding_model
16
+
17
+ self.router = None
18
+ self.ranker = None
19
+ self._is_initialized = False
20
+
21
+ def _lazy_init_local_resources(self):
22
+
23
+ if not self._is_initialized:
24
+ torch.set_num_threads(1)
25
+
26
+ bi_encoder = SentenceTransformer(self.embedding_model)
27
+ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
28
+
29
+ self.router = SemanticSectionRouter(profile=self.profile, model=bi_encoder)
30
+ self.ranker = HybridRanker(
31
+ profile=self.profile,
32
+ model=bi_encoder,
33
+ cross_encoder_model=cross_encoder
34
+ )
35
+ self._is_initialized = True
36
+
37
+ def _process_batch(self, batch: dict, threshold: float) -> dict:
38
+ self._lazy_init_local_resources()
39
+
40
+ cleaned_documents = []
41
+ for raw_text in batch["raw_text"]:
42
+ if not raw_text or not isinstance(raw_text, str):
43
+ cleaned_documents.append("")
44
+ continue
45
+
46
+ sentences = clean_and_split_chunks(raw_text)
47
+ buckets = self.router.route_sentences(sentences)
48
+ print(buckets)
49
+
50
+ target_sentences = []
51
+ for section_key in self.router.profile.target_sections:
52
+ target_sentences.extend(buckets.get(section_key, []))
53
+
54
+ if not target_sentences:
55
+ for sec_content in buckets.values():
56
+ target_sentences.extend(sec_content)
57
+
58
+ filtered_tasks = self.ranker.rank_and_filter(target_sentences, final_threshold=threshold)
59
+ pp_outputs = sanitize_outputs(filtered_tasks)
60
+ cleaned_documents.append(pp_outputs)
61
+
62
+ return {"cleaned_text": cleaned_documents}
63
+
64
+ def process(self, texts: list[str], num_workers: int = None, threshold: float = 0.50) -> list[str]:
65
+ if num_workers is None:
66
+ num_workers = max(1, os.cpu_count() - 1)
67
+
68
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
69
+
70
+ print(f"Creating Dataset for {len(texts)} documents...")
71
+ dataset = Dataset.from_dict({"raw_text": texts})
72
+
73
+ print(f"Running pipeline using {num_workers} parallel worker(s)...")
74
+
75
+ processed_dataset = dataset.map(
76
+ self._process_batch,
77
+ batched=True,
78
+ batch_size=self.batch_size,
79
+ num_proc=num_workers,
80
+ fn_kwargs={
81
+ "threshold": threshold
82
+ },
83
+ desc="Filtering Documents"
84
+ )
85
+
86
+ return processed_dataset["cleaned_text"]
@@ -0,0 +1,64 @@
1
+ from dataclasses import dataclass, field, asdict
2
+ import json
3
+ from pathlib import Path
4
+
5
+ @dataclass
6
+ class DomainProfile:
7
+ name: str
8
+ global_stop_phrases: set[str] = field(default_factory=set)
9
+ anchors: dict[str, str] = field(default_factory=dict)
10
+ default_section: str = "CORE"
11
+ target_sections: list[str] = field(default_factory=list)
12
+ target_semantic_anchor: str = "core actions and focus areas"
13
+ action_keywords_regex: str = r"\b(do|execute|perform)\b"
14
+ syntax_start_pattern: str = r"^.*"
15
+
16
+ def validate(self):
17
+ if not self.name.isalnum():
18
+ raise ValueError("Profile name must be alphanumeric (no spaces or special characters).")
19
+ if not self.section_anchors:
20
+ raise ValueError("DomainProfile must define at least one section anchor.")
21
+ if self.default_section not in self.anchors:
22
+ raise ValueError(f"Default section '{self.default_section}' must be one of the defined section anchors.")
23
+ for section in self.target_sections:
24
+ if section not in self.anchors:
25
+ raise ValueError(f"Target section '{section}' must match a key in section_anchors.")
26
+
27
+ def save_to_json(self, target_dir: Path | str) -> Path:
28
+ self.validate()
29
+ dir_path = Path(target_dir)
30
+ dir_path.mkdir(parents=True, exist_ok=True)
31
+
32
+ file_path = dir_path / f"{self.name}.json"
33
+
34
+ data = asdict(self)
35
+ data['global_stop_phrases'] = list(data['global_stop_phrases'])
36
+
37
+ with open(file_path, 'w', encoding='utf-8') as f:
38
+ json.dump(data, f, indent=4)
39
+
40
+ return file_path
41
+
42
+ @classmethod
43
+ def load_from_json(cls, file_path: Path | str) -> 'DomainProfile':
44
+ with open(file_path, 'r', encoding='utf-8') as f:
45
+ data = json.load(f)
46
+
47
+ data['global_stop_phrases'] = set(data['global_stop_phrases'])
48
+ return cls(**data)
49
+
50
+
51
# Built-in profile for job postings.
# The name must satisfy DomainProfile.validate(), which rejects anything that
# is not alphanumeric; a name containing a space could never be saved.
JOB_POSTING_PROFILE = DomainProfile(
    name="jobpostings",
    global_stop_phrases={"apply online", "click here", "equal opportunity employer", "visa sponsorship"},
    anchors={
        'CORE': "core job duties, daily responsibilities, tasks, project assignments, what you will build and execute on the job",
        'REQUIREMENTS': "candidate qualifications, required experience, required skills, degrees, interview assessment process, hiring criteria",
        'BOILERPLATE': "company overview, history, mission statement, office locations, perks and benefits, scam alerts, recruitment partners, privacy policies",
    },
    default_section="CORE",
    # BOILERPLATE is an anchor but deliberately not a target section.
    target_sections=['CORE', 'REQUIREMENTS'],
    target_semantic_anchor="execute responsibilities, perform duties, manage projects, build and design systems",
    action_keywords_regex=r"\b(write|design|build|develop|manage|lead|collaborate|implement|maintain)\b",
    # NOTE(review): the only group is optional, so this pattern also matches
    # at the start of any string -- confirm that is intended by the ranker.
    syntax_start_pattern=r"^(?:you\s+will\s+|responsibilities\s+include\s+)?",
)
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: dstpr
3
+ Version: 0.1.0
4
+ Summary: A high-throughput, domain-specific text preprocessing cascading pipeline filter to rank core sentences in texts and filter away boilerplate.
5
+ Author-email: Stephen Meisenbacher <sjmeis@gtgd.com>
6
+ Maintainer-email: Stephen Meisenbacher <sjmeis@gtgd.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 Stephen Meisenbacher
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ Classifier: Programming Language :: Python :: 3
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Operating System :: OS Independent
31
+ Requires-Python: >=3.10
32
+ Description-Content-Type: text/markdown
33
+ License-File: LICENSE
34
+ Requires-Dist: sentence-transformers>=3.0.0
35
+ Requires-Dist: scikit-learn>=1.0.0
36
+ Requires-Dist: numpy>=1.22.0
37
+ Requires-Dist: datasets>=2.12.0
38
+ Requires-Dist: spacy<4.0.0,>=3.8.0
39
+ Dynamic: license-file
40
+
41
+ # DSTPR: Domain-specific Text Pre-processing and Ranking
42
+
43
+ [![PyPI version](https://img.shields.io/pypi/v/dstpr.svg)](https://pypi.org/project/dstpr/)
44
+ [![License](https://img.shields.io/github/license/sjmeis/DSTPR.svg)](https://github.com/sjmeis/DSTPR/blob/main/LICENSE)
45
+
46
+ A high-throughput text preprocessing pipeline designed to filter, segment, and rank core sentences from noisy, plaintext documents.
47
+
48
+ This package strips away boilerplate, disclaimers, and application instructions *before* the data is passed on to heavier processing pipelines.
49
+
50
+ ---
51
+
52
+ ## Pipeline Architecture
53
+
54
+ Instead of running heavy transformer models over every sentence in a document, `DSTPR` pipes documents through progressively stricter layers:
55
+
56
+ 1. **Heuristic Cleansing & Tokenization:** Uses `PySBD` (Python Sentence Boundary Disambiguation) paired with regular expressions to fix punctuation caused by flattening, then filters out sentences containing stop phrases.
57
+ 2. **Semantic Section Routing:** Utilizes a lightweight encoder (Default: `all-MiniLM-L6-v2`) to dynamically find structural transitions in input texts.
58
+ 3. **Hybrid Ranking:** Scores remaining sentences using a combination of **Semantics** (cosine similarity to anchors) and **Lexical Syntax** (regex-supported detection).
59
+ 4. **Parallel Execution Engine:** Wraps the entire pipeline inside efficient datasets to allow for batch processing at scale, with CPU or GPU.
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ To install the package, simply run
66
+
67
+ ```bash
68
+ pip install dstpr
69
+ ```
70
+
71
+ ## Usage
72
+
73
+ ### High-Throughput Batch Processing
74
+
75
+ This is the recommended approach for large-scale data pipelines.
76
+
77
+ ```python
78
+ from dstpr import ParallelPreprocessingPipeline, JOB_POSTING_PROFILE
79
+
80
+ pipeline = ParallelPreprocessingPipeline(profile=JOB_POSTING_PROFILE, batch_size=64)
81
+
82
+ raw_documents = [
83
+ "DOC 1",
84
+ "DOC 2",
85
+ "..."
86
+ ]
87
+
88
+ cleaned_documents = pipeline.process(raw_documents, num_workers=8, threshold=0.25)
89
+ ```
90
+
91
+ ### Advanced Usage
92
+
93
+ If you want to integrate specific pipeline layers directly into an existing workflow, or tweak the internal parameters, you can import individual modules manually:
94
+
95
+ ```python
96
+ from dstpr.cleaners import clean_and_split_chunks
97
+ from dstpr.segmenters import SemanticSectionRouter
98
+ from dstpr.rankers import HybridRanker
99
+ from sentence_transformers import SentenceTransformer
100
+
101
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
102
+
103
+ text = "Job posting text goes here!"
104
+
105
+ # Clean and segment sentences
106
+ sentences = clean_and_split_chunks(text)
107
+
108
+ # Route by context
109
+ router = SemanticSectionRouter(model=embedding_model)
110
+ buckets = router.route_sentences(sentences)
111
+ target_sentences = buckets['CORE'] + buckets['REQUIREMENTS']
112
+
113
+ # Grade and sort text features
114
+ ranker = HybridRanker(model=embedding_model)
115
+ final = ranker.rank_and_filter(target_sentences, bi_cutoff_pct=0.25, final_threshold=0.40)
116
+ ```
117
+
118
+ ### Parameter Tuning
119
+
120
+ To adjust the trade-off between strict filtering and execution speed, consider tweaking these variables in `ParallelPreprocessingPipeline`:
121
+ - `threshold` (Default: `0.5`; passed to `process()`): Controls how aggressively sentences are discarded. Raising this value toward `0.6`, for example, ensures only stronger matching sentences are passed through. Lowering it toward `0.3`, on the other hand, acts as a wider net.
122
+ - `batch_size` (Default: `256`): This depends on your hardware! Adjust for best performance.
123
+
124
+ ## Creating Custom Domain Profiles (via the provided wizard)
125
+
126
+ A `DomainProfile` is required to use `DSTPR` (see the usage example). Out of the box, `dstpr` ships with a pre-configured profile for job postings (`JOB_POSTING_PROFILE`). However, you can easily generate a domain-specific profile for **any domain** using our built-in interactive configuration wizard.
127
+
128
+ To spin up the profile creation walkthrough, simply open your terminal and run:
129
+
130
+ ```bash
131
+ profile-wizard
132
+ ```
133
+
134
+ All profiles generated via the terminal wizard are automatically validated and written out as JSON to a local cache: `~/.config/dstpr/profiles/`
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ dstpr/__init__.py
5
+ dstpr/pipeline.py
6
+ dstpr/profiles.py
7
+ dstpr.egg-info/PKG-INFO
8
+ dstpr.egg-info/SOURCES.txt
9
+ dstpr.egg-info/dependency_links.txt
10
+ dstpr.egg-info/entry_points.txt
11
+ dstpr.egg-info/requires.txt
12
+ dstpr.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ profile-wizard = dstpr.utils.cli:run_profile_wizard
@@ -0,0 +1,5 @@
1
+ sentence-transformers>=3.0.0
2
+ scikit-learn>=1.0.0
3
+ numpy>=1.22.0
4
+ datasets>=2.12.0
5
+ spacy<4.0.0,>=3.8.0
@@ -0,0 +1 @@
1
+ dstpr
@@ -0,0 +1,36 @@
1
[build-system]
requires = ["setuptools>=61.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "dstpr"
version = "0.1.0"
description = "A high-throughput, domain-specific text preprocessing cascading pipeline filter to rank core sentences in texts and filter away boilerplate."
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
authors = [
    { name = "Stephen Meisenbacher", email = "sjmeis@gtgd.com" },
]
maintainers = [
    { name = "Stephen Meisenbacher", email = "sjmeis@gtgd.com" },
]
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
dependencies = [
    "sentence-transformers>=3.0.0",
    "scikit-learn>=1.0.0",
    "numpy>=1.22.0",
    "datasets>=2.12.0",
    "spacy>=3.8.0,<4.0.0",
]

[project.scripts]
profile-wizard = "dstpr.utils.cli:run_profile_wizard"

[tool.setuptools.packages.find]
where = ["."]
# Glob so the sub-packages (dstpr.cleaners, dstpr.rankers, dstpr.segmenters,
# dstpr.utils) are packaged as well; a bare "dstpr" matches only the
# top-level package and leaves the imports in dstpr/__init__.py unresolvable.
include = ["dstpr*"]
dstpr-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+