structflo-ner 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structflo_ner-0.1.0/.github/workflows/ci.yml +31 -0
- structflo_ner-0.1.0/.github/workflows/publish.yml +35 -0
- structflo_ner-0.1.0/.gitignore +34 -0
- structflo_ner-0.1.0/Makefile +35 -0
- structflo_ner-0.1.0/PKG-INFO +113 -0
- structflo_ner-0.1.0/README.md +102 -0
- structflo_ner-0.1.0/notebooks/01_quickstart.ipynb +262 -0
- structflo_ner-0.1.0/pyproject.toml +60 -0
- structflo_ner-0.1.0/structflo/ner/__init__.py +77 -0
- structflo_ner-0.1.0/structflo/ner/_entities.py +153 -0
- structflo_ner-0.1.0/structflo/ner/_examples.py +361 -0
- structflo_ner-0.1.0/structflo/ner/_mapping.py +71 -0
- structflo_ner-0.1.0/structflo/ner/_prompts.py +49 -0
- structflo_ner-0.1.0/structflo/ner/extractor.py +141 -0
- structflo_ner-0.1.0/structflo/ner/profiles.py +105 -0
- structflo_ner-0.1.0/tests/__init__.py +0 -0
- structflo_ner-0.1.0/tests/test_entities.py +141 -0
- structflo_ner-0.1.0/tests/test_extractor.py +147 -0
- structflo_ner-0.1.0/uv.lock +2257 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
check:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Install uv
|
|
16
|
+
uses: astral-sh/setup-uv@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python
|
|
19
|
+
run: uv python install 3.12
|
|
20
|
+
|
|
21
|
+
- name: Install dependencies
|
|
22
|
+
run: uv sync --dev
|
|
23
|
+
|
|
24
|
+
- name: Lint
|
|
25
|
+
run: uv run ruff check structflo/ tests/
|
|
26
|
+
|
|
27
|
+
- name: Format
|
|
28
|
+
run: uv run ruff format --check structflo/ tests/
|
|
29
|
+
|
|
30
|
+
- name: Test
|
|
31
|
+
run: uv run pytest -q
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write # trusted publishing
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
run: uv python install 3.12
|
|
21
|
+
|
|
22
|
+
- name: Install dependencies
|
|
23
|
+
run: uv sync --dev
|
|
24
|
+
|
|
25
|
+
- name: Lint
|
|
26
|
+
run: uv run ruff check structflo/ tests/
|
|
27
|
+
|
|
28
|
+
- name: Test
|
|
29
|
+
run: uv run pytest -q
|
|
30
|
+
|
|
31
|
+
- name: Build
|
|
32
|
+
run: uv build
|
|
33
|
+
|
|
34
|
+
- name: Publish to PyPI
|
|
35
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
*.egg-info/
|
|
11
|
+
*.egg
|
|
12
|
+
|
|
13
|
+
# Virtual environments
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
|
|
17
|
+
# Testing
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
.coverage
|
|
20
|
+
htmlcov/
|
|
21
|
+
|
|
22
|
+
# IDEs
|
|
23
|
+
.vscode/
|
|
24
|
+
.idea/
|
|
25
|
+
*.swp
|
|
26
|
+
*.swo
|
|
27
|
+
|
|
28
|
+
# OS
|
|
29
|
+
.DS_Store
|
|
30
|
+
Thumbs.db
|
|
31
|
+
|
|
32
|
+
# Environment variables
|
|
33
|
+
.env
|
|
34
|
+
.env.*
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
.PHONY: install lint fix format check test clean build
|
|
2
|
+
|
|
3
|
+
## Install all dependencies (including dev)
|
|
4
|
+
install:
|
|
5
|
+
uv sync --dev
|
|
6
|
+
|
|
7
|
+
## Run ruff linter
|
|
8
|
+
lint:
|
|
9
|
+
uv run ruff check structflo/ tests/
|
|
10
|
+
|
|
11
|
+
## Auto-fix lint issues
|
|
12
|
+
fix:
|
|
13
|
+
uv run ruff check --fix structflo/ tests/
|
|
14
|
+
|
|
15
|
+
## Format code with ruff
|
|
16
|
+
format:
|
|
17
|
+
uv run ruff format structflo/ tests/
|
|
18
|
+
|
|
19
|
+
## Check formatting + lint (CI-friendly, no changes)
|
|
20
|
+
check:
|
|
21
|
+
uv run ruff format --check structflo/ tests/
|
|
22
|
+
uv run ruff check structflo/ tests/
|
|
23
|
+
|
|
24
|
+
## Run tests
|
|
25
|
+
test:
|
|
26
|
+
uv run pytest -q
|
|
27
|
+
|
|
28
|
+
## Remove build artifacts
|
|
29
|
+
clean:
|
|
30
|
+
rm -rf dist/ build/ *.egg-info .pytest_cache
|
|
31
|
+
find . -type d -name __pycache__ -exec rm -rf {} +
|
|
32
|
+
|
|
33
|
+
## Build package
|
|
34
|
+
build:
|
|
35
|
+
uv build
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: structflo-ner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Drug discovery NER wrapper around LangExtract — zero-config entity extraction for chemistry and biology.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: langextract>=1.1.1
|
|
8
|
+
Provides-Extra: dataframe
|
|
9
|
+
Requires-Dist: pandas>=1.5; extra == 'dataframe'
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# structflo.ner
|
|
13
|
+
|
|
14
|
+
Drug discovery NER powered by [LangExtract](https://github.com/google/langextract).
|
|
15
|
+
|
|
16
|
+
Extract compounds, targets, bioactivity data, diseases, and more from scientific text — zero configuration required.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install structflo-ner
|
|
22
|
+
# or with uv
|
|
23
|
+
uv add structflo-ner
|
|
24
|
+
|
|
25
|
+
# optional pandas support
|
|
26
|
+
pip install "structflo-ner[dataframe]"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from structflo.ner import NERExtractor
|
|
33
|
+
|
|
34
|
+
extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
|
|
35
|
+
result = extractor.extract(
|
|
36
|
+
"Gefitinib (ZD1839) is a first-generation EGFR inhibitor with IC50 = 0.033 µM approved for NSCLC."
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
print(result.compounds) # [ChemicalEntity(text='Gefitinib', ...)]
|
|
40
|
+
print(result.targets) # [TargetEntity(text='EGFR', ...)]
|
|
41
|
+
print(result.bioactivities) # [BioactivityEntity(text='IC50 = 0.033 µM', ...)]
|
|
42
|
+
print(result.diseases) # [DiseaseEntity(text='NSCLC', ...)]
|
|
43
|
+
|
|
44
|
+
df = result.to_dataframe() # flat pandas DataFrame
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Local models via Ollama
|
|
48
|
+
|
|
49
|
+
Run extraction entirely on your own hardware — no API key needed:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
extractor = NERExtractor(
|
|
53
|
+
model_id="gemma3:27b",
|
|
54
|
+
model_url="http://localhost:11434",
|
|
55
|
+
)
|
|
56
|
+
result = extractor.extract("Sorafenib inhibits VEGFR-2 and RAF kinases.")
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Any model served by Ollama works (gemma, llama, mistral, qwen, deepseek, etc.).
|
|
60
|
+
|
|
61
|
+
## Built-in profiles
|
|
62
|
+
|
|
63
|
+
| Profile | Entity classes |
|
|
64
|
+
|---|---|
|
|
65
|
+
| `FULL` (default) | compounds, targets, diseases, bioactivities, assays, mechanisms |
|
|
66
|
+
| `CHEMISTRY` | compound names, SMILES, CAS numbers, molecular formulas |
|
|
67
|
+
| `BIOLOGY` | targets, gene names, protein names |
|
|
68
|
+
| `BIOACTIVITY` | bioactivity measurements, assays |
|
|
69
|
+
| `DISEASE` | diseases and clinical indications |
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from structflo.ner import NERExtractor, CHEMISTRY
|
|
73
|
+
|
|
74
|
+
extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
|
|
75
|
+
result = extractor.extract(text, profile=CHEMISTRY)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Profiles can be merged:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from structflo.ner import CHEMISTRY, BIOLOGY
|
|
82
|
+
|
|
83
|
+
combined = CHEMISTRY.merge(BIOLOGY)
|
|
84
|
+
result = extractor.extract(text, profile=combined)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Custom profiles
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from structflo.ner import NERExtractor, EntityProfile
|
|
91
|
+
|
|
92
|
+
my_profile = EntityProfile(
|
|
93
|
+
name="kinase_inhibitors",
|
|
94
|
+
entity_classes=["compound_name", "smiles", "target", "bioactivity"],
|
|
95
|
+
prompt="Extract kinase inhibitor names, SMILES, targets, and potency values.",
|
|
96
|
+
examples=my_examples,
|
|
97
|
+
)
|
|
98
|
+
result = extractor.extract(text, profile=my_profile)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Working with results
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
result.all_entities() # flat list of every entity
|
|
105
|
+
result.to_dict() # plain dictionary
|
|
106
|
+
result.to_dataframe() # pandas DataFrame (requires structflo-ner[dataframe])
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Notebooks
|
|
110
|
+
|
|
111
|
+
See the [notebooks/](notebooks/) directory for worked examples:
|
|
112
|
+
|
|
113
|
+
- **01_quickstart.ipynb** — end-to-end extraction with cloud and local models
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# structflo.ner
|
|
2
|
+
|
|
3
|
+
Drug discovery NER powered by [LangExtract](https://github.com/google/langextract).
|
|
4
|
+
|
|
5
|
+
Extract compounds, targets, bioactivity data, diseases, and more from scientific text — zero configuration required.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install structflo-ner
|
|
11
|
+
# or with uv
|
|
12
|
+
uv add structflo-ner
|
|
13
|
+
|
|
14
|
+
# optional pandas support
|
|
15
|
+
pip install "structflo-ner[dataframe]"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Quick start
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from structflo.ner import NERExtractor
|
|
22
|
+
|
|
23
|
+
extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
|
|
24
|
+
result = extractor.extract(
|
|
25
|
+
"Gefitinib (ZD1839) is a first-generation EGFR inhibitor with IC50 = 0.033 µM approved for NSCLC."
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
print(result.compounds) # [ChemicalEntity(text='Gefitinib', ...)]
|
|
29
|
+
print(result.targets) # [TargetEntity(text='EGFR', ...)]
|
|
30
|
+
print(result.bioactivities) # [BioactivityEntity(text='IC50 = 0.033 µM', ...)]
|
|
31
|
+
print(result.diseases) # [DiseaseEntity(text='NSCLC', ...)]
|
|
32
|
+
|
|
33
|
+
df = result.to_dataframe() # flat pandas DataFrame
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Local models via Ollama
|
|
37
|
+
|
|
38
|
+
Run extraction entirely on your own hardware — no API key needed:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
extractor = NERExtractor(
|
|
42
|
+
model_id="gemma3:27b",
|
|
43
|
+
model_url="http://localhost:11434",
|
|
44
|
+
)
|
|
45
|
+
result = extractor.extract("Sorafenib inhibits VEGFR-2 and RAF kinases.")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Any model served by Ollama works (gemma, llama, mistral, qwen, deepseek, etc.).
|
|
49
|
+
|
|
50
|
+
## Built-in profiles
|
|
51
|
+
|
|
52
|
+
| Profile | Entity classes |
|
|
53
|
+
|---|---|
|
|
54
|
+
| `FULL` (default) | compounds, targets, diseases, bioactivities, assays, mechanisms |
|
|
55
|
+
| `CHEMISTRY` | compound names, SMILES, CAS numbers, molecular formulas |
|
|
56
|
+
| `BIOLOGY` | targets, gene names, protein names |
|
|
57
|
+
| `BIOACTIVITY` | bioactivity measurements, assays |
|
|
58
|
+
| `DISEASE` | diseases and clinical indications |
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from structflo.ner import NERExtractor, CHEMISTRY
|
|
62
|
+
|
|
63
|
+
extractor = NERExtractor(api_key="YOUR_GEMINI_KEY")
|
|
64
|
+
result = extractor.extract(text, profile=CHEMISTRY)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Profiles can be merged:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from structflo.ner import CHEMISTRY, BIOLOGY
|
|
71
|
+
|
|
72
|
+
combined = CHEMISTRY.merge(BIOLOGY)
|
|
73
|
+
result = extractor.extract(text, profile=combined)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Custom profiles
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from structflo.ner import NERExtractor, EntityProfile
|
|
80
|
+
|
|
81
|
+
my_profile = EntityProfile(
|
|
82
|
+
name="kinase_inhibitors",
|
|
83
|
+
entity_classes=["compound_name", "smiles", "target", "bioactivity"],
|
|
84
|
+
prompt="Extract kinase inhibitor names, SMILES, targets, and potency values.",
|
|
85
|
+
examples=my_examples,  # your own list of few-shot examples
|
|
86
|
+
)
|
|
87
|
+
result = extractor.extract(text, profile=my_profile)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Working with results
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
result.all_entities() # flat list of every entity
|
|
94
|
+
result.to_dict() # plain dictionary
|
|
95
|
+
result.to_dataframe() # pandas DataFrame (requires structflo-ner[dataframe])
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Notebooks
|
|
99
|
+
|
|
100
|
+
See the [notebooks/](notebooks/) directory for worked examples:
|
|
101
|
+
|
|
102
|
+
- **01_quickstart.ipynb** — end-to-end extraction with cloud and local models
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"metadata": {},
|
|
6
|
+
"source": [
|
|
7
|
+
"# structflo.ner — Quickstart\n",
|
|
8
|
+
"\n",
|
|
9
|
+
"This notebook walks through the core features of `structflo.ner`:\n",
|
|
10
|
+
"\n",
|
|
11
|
+
"1. Basic extraction with a cloud model (Gemini)\n",
|
|
12
|
+
"2. Local extraction with Ollama\n",
|
|
13
|
+
"3. Using built-in profiles\n",
|
|
14
|
+
"4. Custom profiles\n",
|
|
15
|
+
"5. Working with results\n6. Batch extraction"
|
|
16
|
+
]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"cell_type": "markdown",
|
|
20
|
+
"metadata": {},
|
|
21
|
+
"source": [
|
|
22
|
+
"## Setup\n",
|
|
23
|
+
"\n",
|
|
24
|
+
"```bash\n",
|
|
25
|
+
"uv add structflo-ner\n",
|
|
26
|
+
"# for DataFrame support\n",
|
|
27
|
+
"uv add \"structflo-ner[dataframe]\"\n",
|
|
28
|
+
"```"
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"cell_type": "code",
|
|
33
|
+
"execution_count": null,
|
|
34
|
+
"metadata": {},
|
|
35
|
+
"outputs": [],
|
|
36
|
+
"source": [
|
|
37
|
+
"from structflo.ner import NERExtractor, FULL, CHEMISTRY, BIOLOGY, BIOACTIVITY, DISEASE"
|
|
38
|
+
]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"cell_type": "markdown",
|
|
42
|
+
"metadata": {},
|
|
43
|
+
"source": [
|
|
44
|
+
"## 1. Cloud model (Gemini)\n",
|
|
45
|
+
"\n",
|
|
46
|
+
"The default model is `gemini-2.5-flash`. Pass your API key or set the `GEMINI_API_KEY` environment variable."
|
|
47
|
+
]
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"cell_type": "code",
|
|
51
|
+
"execution_count": null,
|
|
52
|
+
"metadata": {},
|
|
53
|
+
"outputs": [],
|
|
54
|
+
"source": [
|
|
55
|
+
"extractor = NERExtractor(api_key=\"YOUR_GEMINI_KEY\") # or set GEMINI_API_KEY env var\n",
|
|
56
|
+
"\n",
|
|
57
|
+
"text = (\n",
|
|
58
|
+
" \"Gefitinib (ZD1839) is a first-generation EGFR tyrosine kinase inhibitor \"\n",
|
|
59
|
+
" \"with IC50 = 0.033 µM, approved for non-small cell lung cancer (NSCLC). \"\n",
|
|
60
|
+
" \"Its SMILES is COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1.\"\n",
|
|
61
|
+
")\n",
|
|
62
|
+
"\n",
|
|
63
|
+
"result = extractor.extract(text)\n",
|
|
64
|
+
"result"
|
|
65
|
+
]
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"cell_type": "markdown",
|
|
69
|
+
"metadata": {},
|
|
70
|
+
"source": [
|
|
71
|
+
"## 2. Local model via Ollama\n",
|
|
72
|
+
"\n",
|
|
73
|
+
"Run extraction on your own hardware — no API key needed.\n",
|
|
74
|
+
"\n",
|
|
75
|
+
"Make sure Ollama is running locally:\n",
|
|
76
|
+
"```bash\n",
|
|
77
|
+
"ollama serve\n",
|
|
78
|
+
"ollama pull gemma3:27b\n",
|
|
79
|
+
"```"
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
"cell_type": "code",
|
|
84
|
+
"execution_count": null,
|
|
85
|
+
"metadata": {},
|
|
86
|
+
"outputs": [],
|
|
87
|
+
"source": [
|
|
88
|
+
"local_extractor = NERExtractor(\n",
|
|
89
|
+
" model_id=\"gemma3:27b\",\n",
|
|
90
|
+
" model_url=\"http://localhost:11434\",\n",
|
|
91
|
+
")\n",
|
|
92
|
+
"\n",
|
|
93
|
+
"result_local = local_extractor.extract(\n",
|
|
94
|
+
" \"Sorafenib is a multi-kinase inhibitor targeting VEGFR-2, PDGFR, and RAF with IC50 values of 90 nM, 57 nM, and 6 nM respectively.\"\n",
|
|
95
|
+
")\n",
|
|
96
|
+
"result_local"
|
|
97
|
+
]
|
|
98
|
+
},
|
|
99
|
+
{
|
|
100
|
+
"cell_type": "markdown",
|
|
101
|
+
"metadata": {},
|
|
102
|
+
"source": [
|
|
103
|
+
"## 3. Built-in profiles\n",
|
|
104
|
+
"\n",
|
|
105
|
+
"Profiles control which entity types are extracted. Use them to focus the model on specific categories."
|
|
106
|
+
]
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"cell_type": "code",
|
|
110
|
+
"execution_count": null,
|
|
111
|
+
"metadata": {},
|
|
112
|
+
"outputs": [],
|
|
113
|
+
"source": [
|
|
114
|
+
"# Extract only chemical entities\n",
|
|
115
|
+
"chem_result = extractor.extract(text, profile=CHEMISTRY)\n",
|
|
116
|
+
"print(\"Compounds:\", chem_result.compounds)\n",
|
|
117
|
+
"print(\"Targets:\", chem_result.targets) # empty — not in CHEMISTRY profile"
|
|
118
|
+
]
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"cell_type": "code",
|
|
122
|
+
"execution_count": null,
|
|
123
|
+
"metadata": {},
|
|
124
|
+
"outputs": [],
|
|
125
|
+
"source": [
|
|
126
|
+
"# Merge profiles to combine entity types\n",
|
|
127
|
+
"combined = CHEMISTRY.merge(BIOLOGY)\n",
|
|
128
|
+
"print(f\"Profile: {combined.name}\")\n",
|
|
129
|
+
"print(f\"Entity classes: {combined.entity_classes}\")\n",
|
|
130
|
+
"\n",
|
|
131
|
+
"combined_result = extractor.extract(text, profile=combined)\n",
|
|
132
|
+
"print(\"Compounds:\", combined_result.compounds)\n",
|
|
133
|
+
"print(\"Targets:\", combined_result.targets)"
|
|
134
|
+
]
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
"cell_type": "markdown",
|
|
138
|
+
"metadata": {},
|
|
139
|
+
"source": [
|
|
140
|
+
"## 4. Custom profiles\n",
|
|
141
|
+
"\n",
|
|
142
|
+
"Define your own extraction profiles for domain-specific use cases."
|
|
143
|
+
]
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"cell_type": "code",
|
|
147
|
+
"execution_count": null,
|
|
148
|
+
"metadata": {},
|
|
149
|
+
"outputs": [],
|
|
150
|
+
"source": [
|
|
151
|
+
"from structflo.ner import EntityProfile\n",
|
|
152
|
+
"\n",
|
|
153
|
+
"kinase_profile = EntityProfile(\n",
|
|
154
|
+
" name=\"kinase_inhibitors\",\n",
|
|
155
|
+
" entity_classes=[\"compound_name\", \"smiles\", \"target\", \"bioactivity\"],\n",
|
|
156
|
+
" prompt=\"Extract kinase inhibitor names, their SMILES strings, kinase targets, and potency values (IC50, Ki, Kd).\",\n",
|
|
157
|
+
" examples=[], # add your own few-shot examples here for best results\n",
|
|
158
|
+
")\n",
|
|
159
|
+
"\n",
|
|
160
|
+
"kinase_result = extractor.extract(text, profile=kinase_profile)\n",
|
|
161
|
+
"kinase_result"
|
|
162
|
+
]
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"cell_type": "markdown",
|
|
166
|
+
"metadata": {},
|
|
167
|
+
"source": [
|
|
168
|
+
"## 5. Working with results"
|
|
169
|
+
]
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"cell_type": "code",
|
|
173
|
+
"execution_count": null,
|
|
174
|
+
"metadata": {},
|
|
175
|
+
"outputs": [],
|
|
176
|
+
"source": [
|
|
177
|
+
"# Access typed entity lists\n",
|
|
178
|
+
"print(\"Compounds:\", result.compounds)\n",
|
|
179
|
+
"print(\"Targets:\", result.targets)\n",
|
|
180
|
+
"print(\"Bioactivities:\", result.bioactivities)\n",
|
|
181
|
+
"print(\"Diseases:\", result.diseases)\n",
|
|
182
|
+
"print(\"Mechanisms:\", result.mechanisms)"
|
|
183
|
+
]
|
|
184
|
+
},
|
|
185
|
+
{
|
|
186
|
+
"cell_type": "code",
|
|
187
|
+
"execution_count": null,
|
|
188
|
+
"metadata": {},
|
|
189
|
+
"outputs": [],
|
|
190
|
+
"source": [
|
|
191
|
+
"# Flat list of all entities\n",
|
|
192
|
+
"for entity in result.all_entities():\n",
|
|
193
|
+
" print(f\"{entity.entity_type:20s} | {entity.text}\")"
|
|
194
|
+
]
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
"cell_type": "code",
|
|
198
|
+
"execution_count": null,
|
|
199
|
+
"metadata": {},
|
|
200
|
+
"outputs": [],
|
|
201
|
+
"source": [
|
|
202
|
+
"# Export to pandas DataFrame\n",
|
|
203
|
+
"df = result.to_dataframe()\n",
|
|
204
|
+
"df"
|
|
205
|
+
]
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
"cell_type": "code",
|
|
209
|
+
"execution_count": null,
|
|
210
|
+
"metadata": {},
|
|
211
|
+
"outputs": [],
|
|
212
|
+
"source": [
|
|
213
|
+
"# Serialize to dict (useful for JSON export)\n",
|
|
214
|
+
"import json\n",
|
|
215
|
+
"\n",
|
|
216
|
+
"print(json.dumps(result.to_dict(), indent=2))"
|
|
217
|
+
]
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"cell_type": "markdown",
|
|
221
|
+
"metadata": {},
|
|
222
|
+
"source": [
|
|
223
|
+
"## 6. Batch extraction\n",
|
|
224
|
+
"\n",
|
|
225
|
+
"Pass a list of texts to extract from multiple documents."
|
|
226
|
+
]
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
"cell_type": "code",
|
|
230
|
+
"execution_count": null,
|
|
231
|
+
"metadata": {},
|
|
232
|
+
"outputs": [],
|
|
233
|
+
"source": [
|
|
234
|
+
"texts = [\n",
|
|
235
|
+
" \"Imatinib inhibits BCR-ABL with IC50 = 0.6 µM in CML.\",\n",
|
|
236
|
+
" \"Trastuzumab targets HER2 in breast cancer patients.\",\n",
|
|
237
|
+
" \"Remdesivir (GS-5734) is an antiviral with EC50 = 0.77 µM against SARS-CoV-2.\",\n",
|
|
238
|
+
"]\n",
|
|
239
|
+
"\n",
|
|
240
|
+
"results = extractor.extract(texts)\n",
|
|
241
|
+
"\n",
|
|
242
|
+
"for i, r in enumerate(results):\n",
|
|
243
|
+
" print(f\"\\n--- Text {i+1} ---\")\n",
|
|
244
|
+
" for entity in r.all_entities():\n",
|
|
245
|
+
" print(f\" {entity.entity_type:20s} | {entity.text}\")"
|
|
246
|
+
]
|
|
247
|
+
}
|
|
248
|
+
],
|
|
249
|
+
"metadata": {
|
|
250
|
+
"kernelspec": {
|
|
251
|
+
"display_name": "Python 3",
|
|
252
|
+
"language": "python",
|
|
253
|
+
"name": "python3"
|
|
254
|
+
},
|
|
255
|
+
"language_info": {
|
|
256
|
+
"name": "python",
|
|
257
|
+
"version": "3.12.0"
|
|
258
|
+
}
|
|
259
|
+
},
|
|
260
|
+
"nbformat": 4,
|
|
261
|
+
"nbformat_minor": 4
|
|
262
|
+
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "structflo-ner"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "Drug discovery NER wrapper around LangExtract — zero-config entity extraction for chemistry and biology."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { text = "Apache-2.0" }
|
|
8
|
+
dependencies = [
|
|
9
|
+
"langextract>=1.1.1",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
dataframe = ["pandas>=1.5"]
|
|
14
|
+
|
|
15
|
+
[dependency-groups]
|
|
16
|
+
dev = [
|
|
17
|
+
"pytest>=7",
|
|
18
|
+
"pytest-mock>=3",
|
|
19
|
+
"ruff>=0.8",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["hatchling"]
|
|
24
|
+
build-backend = "hatchling.build"
|
|
25
|
+
|
|
26
|
+
[tool.hatch.version]
|
|
27
|
+
path = "structflo/ner/__init__.py"
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.wheel]
|
|
30
|
+
packages = ["structflo"]
|
|
31
|
+
|
|
32
|
+
[tool.pytest.ini_options]
|
|
33
|
+
testpaths = ["tests"]
|
|
34
|
+
|
|
35
|
+
# --------------------------------------------------------------------------- #
|
|
36
|
+
# Ruff
|
|
37
|
+
# --------------------------------------------------------------------------- #
|
|
38
|
+
[tool.ruff]
|
|
39
|
+
target-version = "py310"
|
|
40
|
+
line-length = 100
|
|
41
|
+
|
|
42
|
+
[tool.ruff.lint]
|
|
43
|
+
select = [
|
|
44
|
+
"E", # pycodestyle errors
|
|
45
|
+
"W", # pycodestyle warnings
|
|
46
|
+
"F", # pyflakes
|
|
47
|
+
"I", # isort
|
|
48
|
+
"UP", # pyupgrade
|
|
49
|
+
"B", # flake8-bugbear
|
|
50
|
+
"SIM", # flake8-simplify
|
|
51
|
+
]
|
|
52
|
+
ignore = [
|
|
53
|
+
"E501", # line length handled by formatter
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint.isort]
|
|
57
|
+
known-first-party = ["structflo"]
|
|
58
|
+
|
|
59
|
+
[tool.ruff.format]
|
|
60
|
+
quote-style = "double"
|