ppi-analyser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ppi_analyser-0.1.0/MANIFEST.in +4 -0
- ppi_analyser-0.1.0/PKG-INFO +100 -0
- ppi_analyser-0.1.0/README.md +49 -0
- ppi_analyser-0.1.0/ppi_analyser/__init__.py +0 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/__init__.py +0 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/analysis_cache.py +74 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/expansion.py +140 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/mistral_batch_pipeline.py +288 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/modifier_rules.yaml +27 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/modifiers.py +132 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/pipeline.py +422 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/position.py +96 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/prompts.py +206 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/results.py +121 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/sentence.py +443 -0
- ppi_analyser-0.1.0/ppi_analyser/analysis/system_prompts.txt +652 -0
- ppi_analyser-0.1.0/ppi_analyser/config.py +81 -0
- ppi_analyser-0.1.0/ppi_analyser/core.py +107 -0
- ppi_analyser-0.1.0/ppi_analyser/example_usage.py +25 -0
- ppi_analyser-0.1.0/ppi_analyser/exceptions.py +29 -0
- ppi_analyser-0.1.0/ppi_analyser/exporters/__init__.py +0 -0
- ppi_analyser-0.1.0/ppi_analyser/exporters/excel.py +582 -0
- ppi_analyser-0.1.0/ppi_analyser/exporters/pdf.py +442 -0
- ppi_analyser-0.1.0/ppi_analyser/logger.py +27 -0
- ppi_analyser-0.1.0/ppi_analyser/models/__init__.py +0 -0
- ppi_analyser-0.1.0/ppi_analyser/models/base.py +10 -0
- ppi_analyser-0.1.0/ppi_analyser/models/deepseek.py +19 -0
- ppi_analyser-0.1.0/ppi_analyser/models/dummy.py +7 -0
- ppi_analyser-0.1.0/ppi_analyser/models/factory.py +44 -0
- ppi_analyser-0.1.0/ppi_analyser/models/gemini.py +13 -0
- ppi_analyser-0.1.0/ppi_analyser/models/groq.py +21 -0
- ppi_analyser-0.1.0/ppi_analyser/models/mistral.py +199 -0
- ppi_analyser-0.1.0/ppi_analyser/models/no_model.py +92 -0
- ppi_analyser-0.1.0/ppi_analyser/models/ollama.py +23 -0
- ppi_analyser-0.1.0/ppi_analyser/preprocessing/__init__.py +0 -0
- ppi_analyser-0.1.0/ppi_analyser/preprocessing/conversation.py +130 -0
- ppi_analyser-0.1.0/ppi_analyser/preprocessing/detect_narration.py +158 -0
- ppi_analyser-0.1.0/ppi_analyser/preprocessing/segmentation.py +284 -0
- ppi_analyser-0.1.0/ppi_analyser/preprocessing/segmentation_cache.py +65 -0
- ppi_analyser-0.1.0/ppi_analyser/preprocessing/speakers.py +76 -0
- ppi_analyser-0.1.0/ppi_analyser/prompts_batch.txt +957 -0
- ppi_analyser-0.1.0/ppi_analyser/server.py +626 -0
- ppi_analyser-0.1.0/ppi_analyser/stanza/gunicorn_config.py +6 -0
- ppi_analyser-0.1.0/ppi_analyser/stanza/stanza_api.py +123 -0
- ppi_analyser-0.1.0/ppi_analyser/stanza/stanza_api_proxy.py +93 -0
- ppi_analyser-0.1.0/ppi_analyser/stanza/stanza_client.py +56 -0
- ppi_analyser-0.1.0/ppi_analyser/stanza/wsgi.py +5 -0
- ppi_analyser-0.1.0/ppi_analyser/state.py +38 -0
- ppi_analyser-0.1.0/ppi_analyser/test.py +189 -0
- ppi_analyser-0.1.0/ppi_analyser.egg-info/PKG-INFO +100 -0
- ppi_analyser-0.1.0/ppi_analyser.egg-info/SOURCES.txt +57 -0
- ppi_analyser-0.1.0/ppi_analyser.egg-info/dependency_links.txt +1 -0
- ppi_analyser-0.1.0/ppi_analyser.egg-info/entry_points.txt +2 -0
- ppi_analyser-0.1.0/ppi_analyser.egg-info/requires.txt +30 -0
- ppi_analyser-0.1.0/ppi_analyser.egg-info/top_level.txt +4 -0
- ppi_analyser-0.1.0/pyproject.toml +79 -0
- ppi_analyser-0.1.0/setup.cfg +4 -0
- ppi_analyser-0.1.0/windows_installer/docker-compose.yml +22 -0
- ppi_analyser-0.1.0/windows_installer/setup_ppi.py +475 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ppi-analyser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Analyse automatique de phrases préfabriquées d'interaction (PPI) — pipeline NLP + serveur FastAPI
|
|
5
|
+
Author: Youssef Zeroual
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/YoussefZeroual/ppi-analyser
|
|
8
|
+
Project-URL: Repository, https://github.com/YoussefZeroual/ppi-analyser
|
|
9
|
+
Project-URL: Issues, https://github.com/YoussefZeroual/ppi-analyser/issues
|
|
10
|
+
Keywords: nlp,phraseology,french,corpus-linguistics,fastapi,stanza
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: fastapi>=0.111.0
|
|
23
|
+
Requires-Dist: uvicorn[standard]>=0.29.0
|
|
24
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
25
|
+
Requires-Dist: flask>=3.0.0
|
|
26
|
+
Requires-Dist: flask-cors>=4.0.0
|
|
27
|
+
Requires-Dist: torch>=2.3.1
|
|
28
|
+
Requires-Dist: stanza>=1.8.0
|
|
29
|
+
Requires-Dist: nltk>=3.8.1
|
|
30
|
+
Requires-Dist: mistralai>=1.0.0
|
|
31
|
+
Requires-Dist: openai>=1.30.0
|
|
32
|
+
Requires-Dist: ollama>=0.2.0
|
|
33
|
+
Requires-Dist: google-generativeai>=0.7.0
|
|
34
|
+
Requires-Dist: groq>=0.9.0
|
|
35
|
+
Requires-Dist: pandas>=2.2.0
|
|
36
|
+
Requires-Dist: openpyxl>=3.1.2
|
|
37
|
+
Requires-Dist: xlsxwriter>=3.2.0
|
|
38
|
+
Requires-Dist: reportlab>=4.1.0
|
|
39
|
+
Requires-Dist: markdown
|
|
40
|
+
Requires-Dist: weasyprint
|
|
41
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
42
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
43
|
+
Requires-Dist: requests>=2.31.0
|
|
44
|
+
Requires-Dist: httpx>=0.27.0
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: pytest; extra == "dev"
|
|
47
|
+
Requires-Dist: mypy; extra == "dev"
|
|
48
|
+
Requires-Dist: ruff; extra == "dev"
|
|
49
|
+
Requires-Dist: build; extra == "dev"
|
|
50
|
+
Requires-Dist: twine; extra == "dev"
|
|
51
|
+
|
|
52
|
+
# PPI Analyser
|
|
53
|
+
|
|
54
|
+
Outil d'analyse automatique de phrases préfabriquées d'interaction (PPI) françaises.
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
```bash
|
|
58
|
+
python -m venv myenv
|
|
59
|
+
source myenv/bin/activate
|
|
60
|
+
pip install -e .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Configuration
|
|
64
|
+
|
|
65
|
+
Copy `.env.example` to `.env` and fill in your API keys:
|
|
66
|
+
```bash
|
|
67
|
+
cp .env.example .env
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
```python
|
|
72
|
+
from ppi_analyser.core import PPIAnalyser
|
|
73
|
+
from ppi_analyser.config import PipelineConfig, AnalysisMode
|
|
74
|
+
|
|
75
|
+
analyser = PPIAnalyser()
|
|
76
|
+
config = PipelineConfig(
|
|
77
|
+
models=["mistral_mistral-medium-latest"],
|
|
78
|
+
expression="je t'en prie",
|
|
79
|
+
sentence_file="path/to/corpus.xlsx",
|
|
80
|
+
mode=AnalysisMode.ORAL,
|
|
81
|
+
output_dir="path/to/output",
|
|
82
|
+
)
|
|
83
|
+
df, state = analyser.process_sentences(config)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Modes
|
|
87
|
+
|
|
88
|
+
- `ORAL` — corpus oral
|
|
89
|
+
- `ECRIT` — corpus écrit (segmentation automatique)
|
|
90
|
+
- `ECRIT_IA` — corpus écrit avec détection des tours de parole par LLM
|
|
91
|
+
- `ECRIT_TEST` — mode test rapide
|
|
92
|
+
|
|
93
|
+
## Models supported
|
|
94
|
+
|
|
95
|
+
- Mistral (`mistral_mistral-medium-latest`)
|
|
96
|
+
- Ollama (`ollama_mistral:7b`)
|
|
97
|
+
- Groq (`groq_moonshotai/kimi-k2-instruct`)
|
|
98
|
+
- DeepSeek (`deepseek_deepseek-chat`)
|
|
99
|
+
- Gemini (`gemini_gemini-3-flash-preview`)
|
|
100
|
+
# ppi_analyser
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# PPI Analyser
|
|
2
|
+
|
|
3
|
+
Outil d'analyse automatique de phrases préfabriquées d'interaction (PPI) françaises.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
```bash
|
|
7
|
+
python -m venv myenv
|
|
8
|
+
source myenv/bin/activate
|
|
9
|
+
pip install -e .
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Configuration
|
|
13
|
+
|
|
14
|
+
Copy `.env.example` to `.env` and fill in your API keys:
|
|
15
|
+
```bash
|
|
16
|
+
cp .env.example .env
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
```python
|
|
21
|
+
from ppi_analyser.core import PPIAnalyser
|
|
22
|
+
from ppi_analyser.config import PipelineConfig, AnalysisMode
|
|
23
|
+
|
|
24
|
+
analyser = PPIAnalyser()
|
|
25
|
+
config = PipelineConfig(
|
|
26
|
+
models=["mistral_mistral-medium-latest"],
|
|
27
|
+
expression="je t'en prie",
|
|
28
|
+
sentence_file="path/to/corpus.xlsx",
|
|
29
|
+
mode=AnalysisMode.ORAL,
|
|
30
|
+
output_dir="path/to/output",
|
|
31
|
+
)
|
|
32
|
+
df, state = analyser.process_sentences(config)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Modes
|
|
36
|
+
|
|
37
|
+
- `ORAL` — corpus oral
|
|
38
|
+
- `ECRIT` — corpus écrit (segmentation automatique)
|
|
39
|
+
- `ECRIT_IA` — corpus écrit avec détection des tours de parole par LLM
|
|
40
|
+
- `ECRIT_TEST` — mode test rapide
|
|
41
|
+
|
|
42
|
+
## Models supported
|
|
43
|
+
|
|
44
|
+
- Mistral (`mistral_mistral-medium-latest`)
|
|
45
|
+
- Ollama (`ollama_mistral:7b`)
|
|
46
|
+
- Groq (`groq_moonshotai/kimi-k2-instruct`)
|
|
47
|
+
- DeepSeek (`deepseek_deepseek-chat`)
|
|
48
|
+
- Gemini (`gemini_gemini-3-flash-preview`)
|
|
49
|
+
# ppi_analyser
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# analysis/analysis_cache.py
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import hashlib
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# In-memory cache contents; stays None until init() has been called.
_cache: dict | None = None
# Location of the JSON file backing the cache; assigned by init().
_cache_path: Path | None = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def init(cache_path: str) -> None:
    """Set the cache location and load any entries already stored there.

    Call once at the start of a run. The file at ``cache_path`` is expected
    to hold a JSON object. When it is missing, unreadable, not valid JSON, or
    valid JSON of another shape (for example a list), the run starts with an
    empty cache instead of failing.

    Args:
        cache_path: Path of the JSON file backing the cache.
    """
    global _cache, _cache_path
    _cache_path = Path(cache_path)
    # Start empty; only replaced below when a well-formed cache file is found.
    _cache = {}
    if not _cache_path.exists():
        return
    try:
        loaded = json.loads(_cache_path.read_text(encoding="utf-8"))
        if not isinstance(loaded, dict):
            # get()/set() use dict operations on the cache, so any other JSON
            # value must be rejected here rather than failing later.
            raise ValueError(f"expected a JSON object, got {type(loaded).__name__}")
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        logger.warning("Could not load analysis cache: %s — starting fresh", e)
        return
    _cache = loaded
    logger.info("Analysis cache loaded: %d entries from %s", len(_cache), _cache_path)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _require_init() -> None:
    """Raise RuntimeError unless the module-level cache has been initialised."""
    if _cache is not None:
        return
    raise RuntimeError(
        "Analysis cache not initialised — call analysis_cache.init(path) first, "
        "or set use_analysis_cache=True and analysis_cache_path in PipelineConfig."
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _key(conversation: str, expression: str, model: str, submodel: str, prompt_type: str) -> str:
    """Return the hex MD5 digest of the five fields joined with ``|``.

    The digest is the dictionary key under which a result is cached.
    """
    joined = "{}|{}|{}|{}|{}".format(conversation, expression, model, submodel, prompt_type)
    digest = hashlib.md5(joined.encode("utf-8"))
    return digest.hexdigest()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get(conversation: str, expression: str, model: str, submodel: str, prompt_type: str) -> str | None:
    """Look up the cached result for the given fields.

    Returns:
        The stored result, or None when no entry exists for this key.

    Raises:
        RuntimeError: If the cache has not been initialised.
    """
    _require_init()
    cache_key = _key(conversation, expression, model, submodel, prompt_type)
    cached = _cache.get(cache_key)
    if cached is None:
        return None
    logger.debug("Analysis cache HIT — %s / %s / %s", prompt_type, model, expression[:40])
    return cached
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def set(conversation: str, expression: str, model: str, submodel: str, prompt_type: str, result: str) -> None:
    """Store ``result`` under the key derived from the other fields and save the cache.

    Raises:
        RuntimeError: If the cache has not been initialised.
    """
    _require_init()
    cache_key = _key(conversation, expression, model, submodel, prompt_type)
    _cache[cache_key] = result
    # Every insertion is written through to disk.
    _save()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _save() -> None:
    """Write the in-memory cache to the cache file as pretty-printed JSON.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed
    over the real file, so an interrupted write cannot leave a truncated
    cache behind. Failures are logged as warnings and never raised.
    """
    try:
        _cache_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = _cache_path.with_name(_cache_path.name + ".tmp")
        tmp_path.write_text(json.dumps(_cache, ensure_ascii=False, indent=2), encoding="utf-8")
        # Path.replace overwrites the destination in a single rename.
        tmp_path.replace(_cache_path)
    except Exception as e:
        # Deliberately broad, as in the original: a save failure is only logged.
        logger.warning("Could not save analysis cache: %s", e)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def cache_size() -> int:
    """Return the number of entries currently held in the cache.

    Raises:
        RuntimeError: If the cache has not been initialised.
    """
    _require_init()
    entry_count = len(_cache)
    return entry_count
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def clear() -> None:
    """Empty the cache, save the now-empty cache, and log the reset.

    Raises:
        RuntimeError: If the cache has not been initialised.
    """
    global _cache
    _require_init()
    _cache = dict()
    _save()
    logger.info("Analysis cache cleared")
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import re
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from ppi_analyser.stanza.stanza_client import StanzaClient
|
|
5
|
+
from ppi_analyser.exporters.excel import format_ppi_bold
|
|
6
|
+
|
|
7
|
+
# NOTE(review): created at import time, yet no function visible in this module
# references `client` — confirm whether it is still needed.
client = StanzaClient()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def extract_ppi_sentence(tagged_line):
    """Split a tagged turn into the PPI text and the segment that follows it.

    Args:
        tagged_line: Text containing a ``<PPI>...</PPI>`` span (tag matching
            is case-insensitive).

    Returns:
        ``(ppi_text, clean_seg)``: the stripped text of the first tagged
        span, and the text after that span up to the next ``/`` with any
        further PPI tags removed. ``(None, None)`` when no tagged span is
        found.
    """
    match = re.search(r'<PPI>(.*?)</PPI>', tagged_line, re.IGNORECASE)
    if not match:
        return None, None
    ppi_text = match.group(1).strip()
    # Only the text to the right of the tag is kept; the left-hand context is
    # intentionally ignored.
    post = tagged_line[match.end():]
    # Remove any further PPI tags before cutting at "/": otherwise the slash
    # of a later "</PPI>" is taken for a separator and a dangling "<" remains.
    untagged = re.sub(r'</?PPI>', '', post, flags=re.IGNORECASE)
    clean_seg = untagged.split('/', 1)[0].strip()
    return ppi_text, clean_seg
|
|
24
|
+
|
|
25
|
+
def get_ppi_ids(sentence, ppi_text):
    """Return the ids of the first run of consecutive words spelling ``ppi_text``.

    Word texts are joined with single spaces and lower-cased, and whitespace
    around hyphens is removed on both sides before comparing. Runs are tried
    by increasing start position, then by increasing length.

    Returns:
        The set of word ids of the matching run, or an empty set when no run
        matches.
    """
    tokens = sentence.words
    hyphen_gap = r'\s*-\s*'
    target = re.sub(hyphen_gap, '-', ppi_text.lower()).strip()
    total = len(tokens)
    for start in range(total):
        for stop in range(start + 1, total + 1):
            span = tokens[start:stop]
            joined = " ".join(tok.text for tok in span).lower()
            if re.sub(hyphen_gap, '-', joined) == target:
                return {tok.id for tok in span}
    return set()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_ppi_head(sentence, ppi_ids):
    """Return the first PPI word whose head lies outside the PPI.

    Returns:
        The first word of ``sentence.words`` whose id is in ``ppi_ids`` while
        its ``head`` is not, or None when there is no such word.
    """
    outward_pointing = (
        tok for tok in sentence.words
        if tok.id in ppi_ids and tok.head not in ppi_ids
    )
    return next(outward_pointing, None)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_subtree(head_word, words, exclude_ids=frozenset()):
    """Return ``head_word`` and all of its transitive dependents, ordered by id.

    Args:
        head_word: Word at the root of the subtree; only its ``id`` is read.
        words: Words of the sentence; each is read through ``id`` and ``head``.
        exclude_ids: Ids that are never added to the subtree, which also cuts
            off anything reachable only through them. The root is kept even
            if its own id is listed.

    Returns:
        The members of ``words`` whose id belongs to the subtree, sorted by id.
    """
    # Index dependents by head once, instead of rescanning the whole sentence
    # after every newly added word.
    children = {}
    for w in words:
        children.setdefault(w.head, []).append(w.id)

    subtree_ids = {head_word.id}
    pending = [head_word.id]
    while pending:
        for child_id in children.get(pending.pop(), ()):
            if child_id not in subtree_ids and child_id not in exclude_ids:
                subtree_ids.add(child_id)
                pending.append(child_id)
    return sorted((w for w in words if w.id in subtree_ids), key=lambda w: w.id)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_expansion_from_sentence(sentence, ppi_text):
    """Find the first syntactic expansion attached to the PPI in ``sentence``.

    The PPI is located with ``get_ppi_ids`` and its head with
    ``get_ppi_head``. Direct dependents of that head lying outside the PPI
    are then inspected in sentence order, and the first one matching a rule
    below is returned together with its subtree:

    * ``xcomp`` + ``VERB`` -> ``"infinitive"``
    * ``ccomp`` / ``csubj`` -> ``"completive_que"``
    * ``nmod`` / ``obl`` / ``obl:arg`` / ``obj`` / ``advcl`` with upos
      ``NOUN`` / ``PRON`` / ``VERB`` -> ``"nominal_prep"``

    Returns:
        A one-element list ``[{"type": ..., "tokens": [...]}]``. ``type`` is
        None and ``tokens`` is empty when the PPI, its head, or a matching
        dependent cannot be found.
    """
    ppi_ids = get_ppi_ids(sentence, ppi_text)
    if not ppi_ids:
        return [{"type": None, "tokens": []}]
    ppi_head = get_ppi_head(sentence, ppi_ids)
    if not ppi_head:
        return [{"type": None, "tokens": []}]

    words = sentence.words
    for dep in words:
        if dep.head != ppi_head.id or dep.id in ppi_ids:
            continue
        if dep.deprel == "xcomp" and dep.upos == "VERB":
            kind = "infinitive"
        elif dep.deprel in ("ccomp", "csubj"):
            kind = "completive_que"
        elif dep.deprel in ("nmod", "obl", "obl:arg", "obj", "advcl") and dep.upos in ("NOUN", "PRON", "VERB"):
            kind = "nominal_prep"
        else:
            continue
        # Only the first expansion is ever returned, so stop at the first hit
        # instead of building subtrees for every remaining dependent.
        return [{"type": kind, "tokens": get_subtree(dep, words, exclude_ids=ppi_ids)}]
    return [{"type": None, "tokens": []}]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def detect_expansion(doc, ppi_text, occurrence=0):
    """Return the expansion found in the ``occurrence``-th matching sentence.

    Sentences of ``doc.sentences`` are scanned in order; only those for
    which ``get_expansion_from_sentence`` yields a typed or non-empty result
    are counted.

    Returns:
        The result for the selected sentence, or
        ``[{"type": None, "tokens": []}]`` when there are not enough matches.
    """
    seen = 0
    for sent in doc.sentences:
        found = get_expansion_from_sentence(sent, ppi_text)
        first = found[0]
        if first["type"] is None and not first["tokens"]:
            continue
        if seen == occurrence:
            return found
        seen += 1
    return [{"type": None, "tokens": []}]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def process_file(input_path):
    """Extract the expansion of every <PPI>-tagged line of a corpus file.

    The file is read with pandas (CSV when the path ends in ".csv", Excel
    otherwise). Each row's "Conversation" cell is split into lines, and every
    line containing a <PPI> tag yields one output row. The rows are handed to
    format_ppi_bold together with the output path "<input>_expansion.xlsx".
    Progress is printed to stdout.

    Args:
        input_path: Path of the input corpus file, as a string.
    """
    if input_path.endswith(".csv"):
        df_in = pd.read_csv(input_path)
    else:
        df_in = pd.read_excel(input_path)
    # Debug output: column names and the first two rows.
    print(df_in.columns.tolist())
    print(df_in.head(2))
    rows = []
    for _, row in df_in.iterrows():
        conversation = str(row.get("Conversation", ""))
        lines = conversation.split("\n")
        for line in lines:
            # Only lines carrying a PPI tag are analysed.
            if not re.search(r'<PPI>', line, re.IGNORECASE):
                continue
            ppi_text, clean_seg = extract_ppi_sentence(line)
            if not ppi_text:
                continue
            # NOTE(review): detect_expansion iterates `doc.sentences`, but
            # clean_seg is a plain string here — presumably it should first be
            # parsed (perhaps via the module-level `client`); confirm.
            expansions = detect_expansion(clean_seg, ppi_text)
            exp = expansions[0]
            # NOTE(review): tokens are subscripted with w["text"] here, while
            # the helpers in this module read words by attribute (w.text) —
            # verify which shape the tokens actually have.
            expansion_text = " ".join(w["text"] for w in exp["tokens"]) if exp["tokens"] else ""
            print(f"Tour : {line.strip()}")
            print(f" Type : {exp['type']}")
            print(f" Expansion: {expansion_text}\n")
            rows.append({
                "Tour": line.strip(),
                "PPI": ppi_text,
                "Type_expansion_1": exp["type"] if len(expansions) > 0 else "",
                "Expansion_1": expansion_text,
                "Type_expansion_2": expansions[1]["type"] if len(expansions) > 1 else "",
                "Expansion_2": " ".join(w["text"] for w in expansions[1]["tokens"]) if len(expansions) > 1 and expansions[1]["tokens"] else "",
            })

    df_out = pd.DataFrame(rows)
    # Strip a trailing .xlsx/.csv extension to build the output file name.
    base = re.sub(r'\.(xlsx|csv)$', '', input_path)
    output_path = f"{base}_expansion.xlsx"
    format_ppi_bold(df_out, output_path)
    print(f"Résultat enregistré : {output_path}")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
if __name__ == "__main__":
    # Script entry point: the corpus file path is the first command-line argument.
    if len(sys.argv) < 2:
        print("Usage: python expansion.py <fichier.csv|xlsx>")
        sys.exit(1)
    process_file(sys.argv[1])
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
# analysis/mistral_batch_pipeline.py
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ppi_analyser.analysis.pipeline import PreprocessedSentence, _chunk
|
|
8
|
+
from ppi_analyser.analysis.prompts import get_prompts_batch, get_prompt_type
|
|
9
|
+
from ppi_analyser.analysis.sentence import _handle_no_model_batch, _parse_batch_response
|
|
10
|
+
from ppi_analyser.models.factory import get_mistral_batch_provider
|
|
11
|
+
from ppi_analyser.config import PipelineConfig
|
|
12
|
+
from ppi_analyser.state import SessionState
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
#NON_IA = dict(state.no_ia)#{0, 1, 5, 7,8} # Forme, Lemme, Position, expansion — handled locally, not submitted to Mistral
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _custom_id(chunk_idx: int, prop_idx: int) -> str:
    """Build the ``c<chunk>_p<prop>`` identifier for one (chunk, property) pair."""
    identifier = f"c{chunk_idx}_p{prop_idx}"
    return identifier
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Main entry point
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
def analyse_batch_mistral_async(
    preprocessed: list[PreprocessedSentence],
    lemmes: list[str] | None,
    config: PipelineConfig,
    state: SessionState,
) -> tuple[list[PreprocessedSentence], list[list[str]]]:
    """Submit one Mistral batch job per (chunk, property) and collect the results.

    If the provider reports a saved job state, no new job is submitted and
    the saved jobs are polled instead. Otherwise the sentences are split into
    chunks of ``config.batch_size``, the prompts for each chunk are built, and
    every property that is neither listed in ``state.no_ia`` nor filtered out
    by ``config.properties`` / ``state.custom_properties_list`` is either
    served from the analysis cache or submitted as its own job.

    Returns:
        ``(preprocessed, all_results)`` where ``all_results`` is whatever
        ``_poll_and_assemble`` returned.

    Raises:
        InterruptedError: If ``_poll_and_assemble`` returns None.
        ValueError: From ``_resolve_submodel`` when ``config.models`` holds no
            ``mistral_batch_`` entry.
    """
    # Property indices that are never submitted to Mistral.
    NON_IA = set(state.no_ia)

    submodel = _resolve_submodel(config.models)
    provider = get_mistral_batch_provider(submodel, config.output_dir)

    # Check for saved job state from a previous interrupted run
    saved = provider.load_job_state()
    if saved:
        logger.info("Resuming Mistral batch jobs from saved state")
        # NOTE(review): no cached_results / preprocessed_path are passed on
        # this path. Properties that were served from the analysis cache in
        # the original run are not in the saved job_map, so they reach
        # _assemble without a result; and a further timeout rewrites the state
        # file with "preprocessed_json": null. Confirm whether this is intended.
        all_results = _poll_and_assemble(saved["job_map"], provider, preprocessed, lemmes, config, state)
        if all_results is None:
            raise InterruptedError("Mistral batch jobs still running. Re-run to resume.")
        provider.clear_job_state()
        return preprocessed, all_results

    # Save preprocessed so we can reassemble on resume
    preprocessed_path = Path(config.output_dir) / "mistral_batch_preprocessed.json"
    _save_preprocessed(preprocessed, lemmes, preprocessed_path)

    chunks = _chunk(preprocessed, config.batch_size)
    lemme_chunks = _chunk(lemmes, config.batch_size) if lemmes else [None] * len(chunks)

    # Submit one job per (chunk × property), skip NON_IA
    job_map = {}  # "c{i}_p{j}" -> job_id
    cached_results = {}  # "c{i}_p{j}" -> raw_response (from analysis cache)
    prompt_map = {}  # "c{i}_p{j}" -> {user, prompt_type} for cache saving

    for chunk_idx, (chunk, lemme_chunk) in enumerate(zip(chunks, lemme_chunks)):
        # The first lemma of the chunk stands for the whole chunk when lemmas
        # are supplied; otherwise the configured expression is used.
        expression = lemme_chunk[0] if lemme_chunk else config.expression
        if lemme_chunk:
            state.expression_list.extend(lemme_chunk)

        system_prompts, batched_prompts = get_prompts_batch(
            expression=expression,
            forme_relevee_list=[s.forme_relevee for s in chunk],
            conv_list=[s.cleaned for s in chunk],
            locuteur_list=[s.locuteur for s in chunk],
            interlocuteur_list=[s.interlocuteurs for s in chunk],
            mode=config.mode,
        )

        for prop_idx, (system_prompt, batched_prompt) in enumerate(
            zip(system_prompts, batched_prompts)
        ):
            if prop_idx in NON_IA:
                continue
            prompt_type = get_prompt_type(system_prompt)
            # Skip properties the caller did not ask for.
            if config.properties and prompt_type not in config.properties:
                continue
            if state.custom_properties_list is not None and prompt_type not in state.custom_properties_list:  # also honour the custom property selection
                continue

            cid = _custom_id(chunk_idx, prop_idx)

            # Check analysis cache before submitting
            if state.use_analysis_cache:
                from ppi_analyser.analysis.analysis_cache import get as acache_get
                # The whole batched prompt is used as the cache "conversation";
                # the expression field is left empty.
                cached = acache_get(batched_prompt, "", "mistral_batch", submodel, prompt_type)
                if cached is not None:
                    logger.debug("Analysis cache HIT for %s chunk %d prop %d", prompt_type, chunk_idx, prop_idx)
                    cached_results[cid] = cached
                    continue

            job_id = provider.submit([{
                "custom_id": cid,
                "system": system_prompt,
                "user": batched_prompt,
            }])
            job_map[cid] = job_id
            prompt_map[cid] = {"user": batched_prompt, "prompt_type": prompt_type}
            logger.info("Submitted job %s for chunk %d prop %d", job_id, chunk_idx, prop_idx)

    # Poll all jobs
    all_results = _poll_and_assemble(job_map, provider, preprocessed, lemmes, config, state,
                                     preprocessed_path=str(preprocessed_path),
                                     cached_results=cached_results,
                                     submodel=submodel,
                                     prompt_map=prompt_map)
    if all_results is None:
        raise InterruptedError("Mistral batch jobs still running. Re-run to resume.")

    provider.clear_job_state()
    return preprocessed, all_results
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
# Poll + assemble
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
def _poll_and_assemble(
    job_map: dict,
    provider,
    preprocessed: list[PreprocessedSentence],
    lemmes: list[str] | None,
    config: PipelineConfig,
    state: SessionState,
    preprocessed_path: str | None = None,
    cached_results: dict | None = None,
    submodel: str = "",
    prompt_map: dict | None = None,
) -> list[list[str]] | None:
    """Poll every job of ``job_map`` and assemble the per-sentence results.

    ``job_map`` maps ``"c{i}_p{j}"`` ids to job ids. Responses are merged on
    top of ``cached_results`` and, when the analysis cache is enabled and
    ``prompt_map`` knows the id, stored in that cache.

    Returns:
        The output of ``_assemble``, or None when at least one poll returned
        None; in that case ``job_map`` and ``preprocessed_path`` are written
        to ``mistral_batch_job.json`` in ``config.output_dir`` first.
    """
    collected = dict(cached_results) if cached_results else {}
    unfinished = []

    for cid, job_id in job_map.items():
        outcome = provider.poll_or_save(job_id, preprocessed_json=preprocessed_path)
        if outcome is None:
            unfinished.append(cid)
            continue

        response_text = outcome.get(cid, "")
        collected[cid] = response_text

        # Only non-empty responses with known prompt metadata are cached.
        cacheable = state.use_analysis_cache and response_text and prompt_map and cid in prompt_map
        if not cacheable:
            continue
        from ppi_analyser.analysis.analysis_cache import set as acache_set
        meta = prompt_map[cid]
        acache_set(meta["user"], "", "mistral_batch", submodel,
                   meta["prompt_type"], response_text)
        logger.debug("Analysis cache SET for %s", meta["prompt_type"])

    if not unfinished:
        return _assemble(collected, preprocessed, lemmes, config, state)

    # Persist what is needed to poll the same jobs again on the next run.
    snapshot = {"job_map": job_map, "preprocessed_json": preprocessed_path}
    state_path = Path(config.output_dir) / "mistral_batch_job.json"
    state_path.write_text(json.dumps(snapshot, indent=2), encoding="utf-8")
    logger.warning("%d jobs timed out. State saved to %s", len(unfinished), state_path)
    return None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
# Assemble
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
|
|
171
|
+
def _assemble(
    raw_results: dict,
    preprocessed: list[PreprocessedSentence],
    lemmes: list[str] | None,
    config: PipelineConfig,
    state: SessionState,
) -> list[list[str]]:
    """Turn raw per-(chunk, property) responses into one row per sentence.

    ``raw_results`` maps ``"c{chunk}_p{prop}"`` to the raw response string.
    Properties handled locally go through ``_handle_no_model_batch``,
    deselected properties become None, and everything else is parsed with
    ``_parse_batch_response``.

    Returns:
        ``results[sentence][property]`` covering all chunks in order.
    """
    sentence_chunks = _chunk(preprocessed, config.batch_size)
    if lemmes:
        lemma_chunks = _chunk(lemmes, config.batch_size)
    else:
        lemma_chunks = [None] * len(sentence_chunks)
    rows = []
    from ppi_analyser.config import AnalysisMode
    # Property indices resolved locally instead of from a Mistral response.
    local_props = [0,1,5] if state.analysis_mode == AnalysisMode.ORAL else set(state.no_ia)

    for chunk_idx, (chunk, lemma_chunk) in enumerate(zip(sentence_chunks, lemma_chunks)):
        chunk_len = len(chunk)
        expression = config.expression if not lemma_chunk else lemma_chunk[0]

        system_prompts, _ = get_prompts_batch(
            expression=expression,
            forme_relevee_list=[s.forme_relevee for s in chunk],
            conv_list=[s.cleaned for s in chunk],
            locuteur_list=[s.locuteur for s in chunk],
            interlocuteur_list=[s.interlocuteurs for s in chunk],
            mode=config.mode,
        )

        columns = []
        for prop_idx, system_prompt in enumerate(system_prompts):
            if prop_idx in local_props:
                columns.append(_handle_no_model_batch(
                    system_prompt=system_prompt,
                    conversations=[s.cleaned for s in chunk],
                    expression=expression,
                    forme_relevee_list=[s.forme_relevee for s in chunk],
                    state=state,
                    mode=config.mode,
                    n_sentences=chunk_len,
                    start_offset=chunk_idx * config.batch_size
                ))
                continue

            cid = _custom_id(chunk_idx, prop_idx)
            prompt_type = get_prompt_type(system_prompt)
            deselected = (
                (config.properties and prompt_type not in config.properties)
                or (state.custom_properties_list is not None
                    and prompt_type not in state.custom_properties_list)
            )
            if deselected:
                columns.append([None] * chunk_len)
                continue

            response_text = raw_results.get(cid, "")
            if not response_text:
                similar = [k for k in raw_results if k.startswith(f"c{chunk_idx}_")]
                logger.warning("No result for %s — keys for this chunk: %s", cid, similar)
            columns.append(_parse_batch_response(response_text, chunk_len))

        # Transpose: property-major columns -> sentence-major rows.
        for sent_idx in range(chunk_len):
            rows.append([
                None if column is None or sent_idx >= len(column) else column[sent_idx]
                for column in columns
            ])

    return rows
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
# Helpers
|
|
252
|
+
# ---------------------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
def _resolve_submodel(models: list[str]) -> str:
    """Return the submodel name of the first ``mistral_batch_<submodel>`` entry.

    Raises:
        ValueError: If no entry of ``models`` starts with ``mistral_batch_``.
    """
    prefix = "mistral_batch_"
    entry = next((name for name in models if name.startswith(prefix)), None)
    if entry is None:
        raise ValueError("No mistral_batch_<submodel> entry found in models list")
    return entry[len(prefix):]
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _save_preprocessed(preprocessed, lemmes, path: Path) -> None:
    """Dump the preprocessed sentences and lemmas to ``path`` as JSON.

    Args:
        preprocessed: Sentences to save; ``raw``, ``cleaned``, ``locuteur``,
            ``interlocuteurs`` and ``forme_relevee`` are read from each.
        lemmes: Lemma list stored as-is under ``"lemmes"`` (may be None).
        path: Destination file; missing parent directories are created.
    """
    data = {
        "preprocessed": [
            {
                "raw": s.raw,
                "cleaned": s.cleaned,
                "locuteur": s.locuteur,
                "interlocuteurs": s.interlocuteurs,
                "forme_relevee": s.forme_relevee,
            }
            for s in preprocessed
        ],
        "lemmes": lemmes,
    }
    # The output directory may not exist yet; create it rather than letting
    # write_text fail with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    logger.info("Preprocessed sentences saved to %s", path)
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _load_preprocessed(path: str) -> tuple[list[PreprocessedSentence], list[str] | None]:
    """Read back a JSON file with ``"preprocessed"`` and optional ``"lemmes"`` keys.

    Returns:
        ``(sentences, lemmes)``: one PreprocessedSentence per stored entry,
        and the stored lemma list, or None when the file has no ``"lemmes"``
        key.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    sentences = []
    for item in payload["preprocessed"]:
        sentences.append(
            PreprocessedSentence(
                raw=item["raw"],
                cleaned=item["cleaned"],
                locuteur=item["locuteur"],
                interlocuteurs=item["interlocuteurs"],
                forme_relevee=item["forme_relevee"],
            )
        )
    return sentences, payload.get("lemmes")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Les parties du discours des modifieurs
|
|
2
|
+
upos:
|
|
3
|
+
- ADV
|
|
4
|
+
- ADJ
|
|
5
|
+
- NOUN
|
|
6
|
+
# dépendances syntaxiques par rapport à la tête de la phrase
|
|
7
|
+
deprel:
|
|
8
|
+
- obl:mod
|
|
9
|
+
- nmod
|
|
10
|
+
- acl:relcl
|
|
11
|
+
- dislocated
|
|
12
|
+
- amod
|
|
13
|
+
|
|
14
|
+
# lemmes surajoutés à la PPI standard
|
|
15
|
+
lemma:
|
|
16
|
+
- dieu
|
|
17
|
+
- diable
|
|
18
|
+
|
|
19
|
+
# à exclure
|
|
20
|
+
|
|
21
|
+
excluded_upos:
|
|
22
|
+
- PUNCT
|
|
23
|
+
|
|
24
|
+
excluded_deprel:
|
|
25
|
+
-
|
|
26
|
+
excluded_lemma:
|
|
27
|
+
-
|