structflo-ner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ check:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - name: Install uv
16
+ uses: astral-sh/setup-uv@v4
17
+
18
+ - name: Set up Python
19
+ run: uv python install 3.12
20
+
21
+ - name: Install dependencies
22
+ run: uv sync --dev
23
+
24
+ - name: Lint
25
+ run: uv run ruff check structflo/ tests/
26
+
27
+ - name: Format
28
+ run: uv run ruff format --check structflo/ tests/
29
+
30
+ - name: Test
31
+ run: uv run pytest -q
@@ -0,0 +1,35 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ id-token: write # required for PyPI trusted publishing (OIDC token exchange)
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@v4
18
+
19
+ - name: Set up Python
20
+ run: uv python install 3.12
21
+
22
+ - name: Install dependencies
23
+ run: uv sync --dev
24
+
25
+ - name: Lint
26
+ run: uv run ruff check structflo/ tests/
27
+
28
+ - name: Test
29
+ run: uv run pytest -q
30
+
31
+ - name: Build
32
+ run: uv build
33
+
34
+ - name: Publish to PyPI
35
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,34 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+
7
+ # Distribution / packaging
8
+ dist/
9
+ build/
10
+ *.egg-info/
11
+ *.egg
12
+
13
+ # Virtual environments
14
+ .venv/
15
+ venv/
16
+
17
+ # Testing
18
+ .pytest_cache/
19
+ .coverage
20
+ htmlcov/
21
+
22
+ # IDEs
23
+ .vscode/
24
+ .idea/
25
+ *.swp
26
+ *.swo
27
+
28
+ # OS
29
+ .DS_Store
30
+ Thumbs.db
31
+
32
+ # Environment variables
33
+ .env
34
+ .env.*
@@ -0,0 +1,35 @@
1
+ .PHONY: install lint fix format check test clean build
2
+
3
+ ## Install all dependencies (including dev)
4
+ install:
5
+ uv sync --dev
6
+
7
+ ## Run ruff linter
8
+ lint:
9
+ uv run ruff check structflo/ tests/
10
+
11
+ ## Auto-fix lint issues
12
+ fix:
13
+ uv run ruff check --fix structflo/ tests/
14
+
15
+ ## Format code with ruff
16
+ format:
17
+ uv run ruff format structflo/ tests/
18
+
19
+ ## Check formatting + lint (CI-friendly, no changes)
20
+ check:
21
+ uv run ruff format --check structflo/ tests/
22
+ uv run ruff check structflo/ tests/
23
+
24
+ ## Run tests
25
+ test:
26
+ uv run pytest -q
27
+
28
+ ## Remove build artifacts
29
+ clean:
30
+ rm -rf dist/ build/ *.egg-info .pytest_cache
31
+ find . -type d -name __pycache__ -exec rm -rf {} +
32
+
33
+ ## Build package
34
+ build:
35
+ uv build
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.4
2
+ Name: structflo-ner
3
+ Version: 0.1.0
4
+ Summary: Drug discovery NER wrapper around LangExtract — zero-config entity extraction for chemistry and biology.
5
+ License: Apache-2.0
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: langextract>=1.1.1
8
+ Provides-Extra: dataframe
9
+ Requires-Dist: pandas>=1.5; extra == 'dataframe'
10
+ Description-Content-Type: text/markdown
11
+
12
+ # structflo.ner
13
+
14
+ Drug discovery NER powered by [LangExtract](https://github.com/google/langextract).
15
+
16
+ Extract compounds, targets, bioactivity data, diseases, and more from scientific text — zero configuration required.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ pip install structflo-ner
22
+ # or with uv
23
+ uv add structflo-ner
24
+
25
+ # optional pandas support
26
+ pip install "structflo-ner[dataframe]"
27
+ ```
28
+
29
+ ## Quick start
30
+
31
+ ```python
32
+ from structflo.ner import NERExtractor
33
+
34
+ extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
35
+ result = extractor.extract(
36
+ "Gefitinib (ZD1839) is a first-generation EGFR inhibitor with IC50 = 0.033 µM approved for NSCLC."
37
+ )
38
+
39
+ print(result.compounds) # [ChemicalEntity(text='Gefitinib', ...)]
40
+ print(result.targets) # [TargetEntity(text='EGFR', ...)]
41
+ print(result.bioactivities) # [BioactivityEntity(text='IC50 = 0.033 µM', ...)]
42
+ print(result.diseases) # [DiseaseEntity(text='NSCLC', ...)]
43
+
44
+ df = result.to_dataframe() # flat pandas DataFrame (requires structflo-ner[dataframe])
45
+ ```
46
+
47
+ ## Local models via Ollama
48
+
49
+ Run extraction entirely on your own hardware — no API key needed:
50
+
51
+ ```python
52
+ extractor = NERExtractor(
53
+ model_id="gemma3:27b",
54
+ model_url="http://localhost:11434",
55
+ )
56
+ result = extractor.extract("Sorafenib inhibits VEGFR-2 and RAF kinases.")
57
+ ```
58
+
59
+ Any model served by Ollama works (gemma, llama, mistral, qwen, deepseek, etc.).
60
+
61
+ ## Built-in profiles
62
+
63
+ | Profile | Entity classes |
64
+ |---|---|
65
+ | `FULL` (default) | compounds, targets, diseases, bioactivities, assays, mechanisms |
66
+ | `CHEMISTRY` | compound names, SMILES, CAS numbers, molecular formulas |
67
+ | `BIOLOGY` | targets, gene names, protein names |
68
+ | `BIOACTIVITY` | bioactivity measurements, assays |
69
+ | `DISEASE` | diseases and clinical indications |
70
+
71
+ ```python
72
+ from structflo.ner import NERExtractor, CHEMISTRY
73
+
74
+ extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
75
+ result = extractor.extract(text, profile=CHEMISTRY)
76
+ ```
77
+
78
+ Profiles can be merged:
79
+
80
+ ```python
81
+ from structflo.ner import CHEMISTRY, BIOLOGY
82
+
83
+ combined = CHEMISTRY.merge(BIOLOGY)
84
+ result = extractor.extract(text, profile=combined)
85
+ ```
86
+
87
+ ## Custom profiles
88
+
89
+ ```python
90
+ from structflo.ner import NERExtractor, EntityProfile
91
+
92
+ my_profile = EntityProfile(
93
+ name="kinase_inhibitors",
94
+ entity_classes=["compound_name", "smiles", "target", "bioactivity"],
95
+ prompt="Extract kinase inhibitor names, SMILES, targets, and potency values.",
96
+ examples=my_examples, # your own few-shot examples (define before use)
97
+ )
98
+ result = extractor.extract(text, profile=my_profile)
99
+ ```
100
+
101
+ ## Working with results
102
+
103
+ ```python
104
+ result.all_entities() # flat list of every entity
105
+ result.to_dict() # plain dictionary
106
+ result.to_dataframe() # pandas DataFrame (requires structflo-ner[dataframe])
107
+ ```
108
+
109
+ ## Notebooks
110
+
111
+ See the [notebooks/](notebooks/) directory for worked examples:
112
+
113
+ - **01_quickstart.ipynb** — end-to-end extraction with cloud and local models
@@ -0,0 +1,102 @@
1
+ # structflo.ner
2
+
3
+ Drug discovery NER powered by [LangExtract](https://github.com/google/langextract).
4
+
5
+ Extract compounds, targets, bioactivity data, diseases, and more from scientific text — zero configuration required.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install structflo-ner
11
+ # or with uv
12
+ uv add structflo-ner
13
+
14
+ # optional pandas support
15
+ pip install "structflo-ner[dataframe]"
16
+ ```
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from structflo.ner import NERExtractor
22
+
23
+ extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
24
+ result = extractor.extract(
25
+ "Gefitinib (ZD1839) is a first-generation EGFR inhibitor with IC50 = 0.033 µM approved for NSCLC."
26
+ )
27
+
28
+ print(result.compounds) # [ChemicalEntity(text='Gefitinib', ...)]
29
+ print(result.targets) # [TargetEntity(text='EGFR', ...)]
30
+ print(result.bioactivities) # [BioactivityEntity(text='IC50 = 0.033 µM', ...)]
31
+ print(result.diseases) # [DiseaseEntity(text='NSCLC', ...)]
32
+
33
+ df = result.to_dataframe() # flat pandas DataFrame (requires structflo-ner[dataframe])
34
+ ```
35
+
36
+ ## Local models via Ollama
37
+
38
+ Run extraction entirely on your own hardware — no API key needed:
39
+
40
+ ```python
41
+ extractor = NERExtractor(
42
+ model_id="gemma3:27b",
43
+ model_url="http://localhost:11434",
44
+ )
45
+ result = extractor.extract("Sorafenib inhibits VEGFR-2 and RAF kinases.")
46
+ ```
47
+
48
+ Any model served by Ollama works (gemma, llama, mistral, qwen, deepseek, etc.).
49
+
50
+ ## Built-in profiles
51
+
52
+ | Profile | Entity classes |
53
+ |---|---|
54
+ | `FULL` (default) | compounds, targets, diseases, bioactivities, assays, mechanisms |
55
+ | `CHEMISTRY` | compound names, SMILES, CAS numbers, molecular formulas |
56
+ | `BIOLOGY` | targets, gene names, protein names |
57
+ | `BIOACTIVITY` | bioactivity measurements, assays |
58
+ | `DISEASE` | diseases and clinical indications |
59
+
60
+ ```python
61
+ from structflo.ner import NERExtractor, CHEMISTRY
62
+
63
+ extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
64
+ result = extractor.extract(text, profile=CHEMISTRY)
65
+ ```
66
+
67
+ Profiles can be merged:
68
+
69
+ ```python
70
+ from structflo.ner import CHEMISTRY, BIOLOGY
71
+
72
+ combined = CHEMISTRY.merge(BIOLOGY)
73
+ result = extractor.extract(text, profile=combined)
74
+ ```
75
+
76
+ ## Custom profiles
77
+
78
+ ```python
79
+ from structflo.ner import NERExtractor, EntityProfile
80
+
81
+ my_profile = EntityProfile(
82
+ name="kinase_inhibitors",
83
+ entity_classes=["compound_name", "smiles", "target", "bioactivity"],
84
+ prompt="Extract kinase inhibitor names, SMILES, targets, and potency values.",
85
+ examples=my_examples, # your own few-shot examples (define before use)
86
+ )
87
+ result = extractor.extract(text, profile=my_profile)
88
+ ```
89
+
90
+ ## Working with results
91
+
92
+ ```python
93
+ result.all_entities() # flat list of every entity
94
+ result.to_dict() # plain dictionary
95
+ result.to_dataframe() # pandas DataFrame (requires structflo-ner[dataframe])
96
+ ```
97
+
98
+ ## Notebooks
99
+
100
+ See the [notebooks/](notebooks/) directory for worked examples:
101
+
102
+ - **01_quickstart.ipynb** — end-to-end extraction with cloud and local models
@@ -0,0 +1,262 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# structflo.ner — Quickstart\n",
8
+ "\n",
9
+ "This notebook walks through the core features of `structflo.ner`:\n",
10
+ "\n",
11
+ "1. Basic extraction with a cloud model (Gemini)\n",
12
+ "2. Local extraction with Ollama\n",
13
+ "3. Using built-in profiles\n",
14
+ "4. Custom profiles\n",
15
+ "5. Working with results\n6. Batch extraction"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "metadata": {},
21
+ "source": [
22
+ "## Setup\n",
23
+ "\n",
24
+ "```bash\n",
25
+ "uv add structflo-ner\n",
26
+ "# for DataFrame support\n",
27
+ "uv add \"structflo-ner[dataframe]\"\n",
28
+ "```"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from structflo.ner import NERExtractor, FULL, CHEMISTRY, BIOLOGY, BIOACTIVITY, DISEASE"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "markdown",
42
+ "metadata": {},
43
+ "source": [
44
+ "## 1. Cloud model (Gemini)\n",
45
+ "\n",
46
+ "The default model is `gemini-2.5-flash`. Pass your API key or set the `GEMINI_API_KEY` environment variable."
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "extractor = NERExtractor(api_key=\"YOUR_GEMINI_KEY\") # or set GEMINI_API_KEY env var\n",
56
+ "\n",
57
+ "text = (\n",
58
+ " \"Gefitinib (ZD1839) is a first-generation EGFR tyrosine kinase inhibitor \"\n",
59
+ " \"with IC50 = 0.033 µM, approved for non-small cell lung cancer (NSCLC). \"\n",
60
+ " \"Its SMILES is COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1.\"\n",
61
+ ")\n",
62
+ "\n",
63
+ "result = extractor.extract(text)\n",
64
+ "result"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "## 2. Local model via Ollama\n",
72
+ "\n",
73
+ "Run extraction on your own hardware — no API key needed.\n",
74
+ "\n",
75
+ "Make sure Ollama is running locally:\n",
76
+ "```bash\n",
77
+ "ollama serve\n",
78
+ "ollama pull gemma3:27b\n",
79
+ "```"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": null,
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "local_extractor = NERExtractor(\n",
89
+ " model_id=\"gemma3:27b\",\n",
90
+ " model_url=\"http://localhost:11434\",\n",
91
+ ")\n",
92
+ "\n",
93
+ "result_local = local_extractor.extract(\n",
94
+ " \"Sorafenib is a multi-kinase inhibitor targeting VEGFR-2, PDGFR, and RAF with IC50 values of 90 nM, 57 nM, and 6 nM respectively.\"\n",
95
+ ")\n",
96
+ "result_local"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "markdown",
101
+ "metadata": {},
102
+ "source": [
103
+ "## 3. Built-in profiles\n",
104
+ "\n",
105
+ "Profiles control which entity types are extracted. Use them to focus the model on specific categories."
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "# Extract only chemical entities\n",
115
+ "chem_result = extractor.extract(text, profile=CHEMISTRY)\n",
116
+ "print(\"Compounds:\", chem_result.compounds)\n",
117
+ "print(\"Targets:\", chem_result.targets) # empty — not in CHEMISTRY profile"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "metadata": {},
124
+ "outputs": [],
125
+ "source": [
126
+ "# Merge profiles to combine entity types\n",
127
+ "combined = CHEMISTRY.merge(BIOLOGY)\n",
128
+ "print(f\"Profile: {combined.name}\")\n",
129
+ "print(f\"Entity classes: {combined.entity_classes}\")\n",
130
+ "\n",
131
+ "combined_result = extractor.extract(text, profile=combined)\n",
132
+ "print(\"Compounds:\", combined_result.compounds)\n",
133
+ "print(\"Targets:\", combined_result.targets)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "markdown",
138
+ "metadata": {},
139
+ "source": [
140
+ "## 4. Custom profiles\n",
141
+ "\n",
142
+ "Define your own extraction profiles for domain-specific use cases."
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "from structflo.ner import EntityProfile\n",
152
+ "\n",
153
+ "kinase_profile = EntityProfile(\n",
154
+ " name=\"kinase_inhibitors\",\n",
155
+ " entity_classes=[\"compound_name\", \"smiles\", \"target\", \"bioactivity\"],\n",
156
+ " prompt=\"Extract kinase inhibitor names, their SMILES strings, kinase targets, and potency values (IC50, Ki, Kd).\",\n",
157
+ " examples=[], # add your own few-shot examples here for best results\n",
158
+ ")\n",
159
+ "\n",
160
+ "kinase_result = extractor.extract(text, profile=kinase_profile)\n",
161
+ "kinase_result"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "metadata": {},
167
+ "source": [
168
+ "## 5. Working with results"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "# Access typed entity lists\n",
178
+ "print(\"Compounds:\", result.compounds)\n",
179
+ "print(\"Targets:\", result.targets)\n",
180
+ "print(\"Bioactivities:\", result.bioactivities)\n",
181
+ "print(\"Diseases:\", result.diseases)\n",
182
+ "print(\"Mechanisms:\", result.mechanisms)"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "# Flat list of all entities\n",
192
+ "for entity in result.all_entities():\n",
193
+ " print(f\"{entity.entity_type:20s} | {entity.text}\")"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": null,
199
+ "metadata": {},
200
+ "outputs": [],
201
+ "source": [
202
+ "# Export to pandas DataFrame\n",
203
+ "df = result.to_dataframe()\n",
204
+ "df"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "# Serialize to dict (useful for JSON export)\n",
214
+ "import json\n",
215
+ "\n",
216
+ "print(json.dumps(result.to_dict(), indent=2))"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "metadata": {},
222
+ "source": [
223
+ "## 6. Batch extraction\n",
224
+ "\n",
225
+ "Pass a list of texts to extract from multiple documents."
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "texts = [\n",
235
+ " \"Imatinib inhibits BCR-ABL with IC50 = 0.6 µM in CML.\",\n",
236
+ " \"Trastuzumab targets HER2 in breast cancer patients.\",\n",
237
+ " \"Remdesivir (GS-5734) is an antiviral with EC50 = 0.77 µM against SARS-CoV-2.\",\n",
238
+ "]\n",
239
+ "\n",
240
+ "results = extractor.extract(texts)\n",
241
+ "\n",
242
+ "for i, r in enumerate(results):\n",
243
+ " print(f\"\\n--- Text {i+1} ---\")\n",
244
+ " for entity in r.all_entities():\n",
245
+ " print(f\" {entity.entity_type:20s} | {entity.text}\")"
246
+ ]
247
+ }
248
+ ],
249
+ "metadata": {
250
+ "kernelspec": {
251
+ "display_name": "Python 3",
252
+ "language": "python",
253
+ "name": "python3"
254
+ },
255
+ "language_info": {
256
+ "name": "python",
257
+ "version": "3.12.0"
258
+ }
259
+ },
260
+ "nbformat": 4,
261
+ "nbformat_minor": 4
262
+ }
@@ -0,0 +1,60 @@
1
+ [project]
2
+ name = "structflo-ner"
3
+ dynamic = ["version"]
4
+ description = "Drug discovery NER wrapper around LangExtract — zero-config entity extraction for chemistry and biology."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = { text = "Apache-2.0" }
8
+ dependencies = [
9
+ "langextract>=1.1.1",
10
+ ]
11
+
12
+ [project.optional-dependencies]
13
+ dataframe = ["pandas>=1.5"]
14
+
15
+ [dependency-groups]
16
+ dev = [
17
+ "pytest>=7",
18
+ "pytest-mock>=3",
19
+ "ruff>=0.8",
20
+ ]
21
+
22
+ [build-system]
23
+ requires = ["hatchling"]
24
+ build-backend = "hatchling.build"
25
+
26
+ [tool.hatch.version]
27
+ path = "structflo/ner/__init__.py"
28
+
29
+ [tool.hatch.build.targets.wheel]
30
+ packages = ["structflo"]
31
+
32
+ [tool.pytest.ini_options]
33
+ testpaths = ["tests"]
34
+
35
+ # --------------------------------------------------------------------------- #
36
+ # Ruff
37
+ # --------------------------------------------------------------------------- #
38
+ [tool.ruff]
39
+ target-version = "py310"
40
+ line-length = 100
41
+
42
+ [tool.ruff.lint]
43
+ select = [
44
+ "E", # pycodestyle errors
45
+ "W", # pycodestyle warnings
46
+ "F", # pyflakes
47
+ "I", # isort
48
+ "UP", # pyupgrade
49
+ "B", # flake8-bugbear
50
+ "SIM", # flake8-simplify
51
+ ]
52
+ ignore = [
53
+ "E501", # line length handled by formatter
54
+ ]
55
+
56
+ [tool.ruff.lint.isort]
57
+ known-first-party = ["structflo"]
58
+
59
+ [tool.ruff.format]
60
+ quote-style = "double"