usdm4-protocol 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. usdm4_protocol/__info__.py +3 -0
  2. usdm4_protocol/__init__.py +104 -0
  3. usdm4_protocol/common/__init__.py +1 -0
  4. usdm4_protocol/common/ai/__init__.py +1 -0
  5. usdm4_protocol/common/ai/base_ai.py +53 -0
  6. usdm4_protocol/common/ai/claude_provider.py +175 -0
  7. usdm4_protocol/common/ai/fallback_provider.py +34 -0
  8. usdm4_protocol/common/assemble/__init__.py +1 -0
  9. usdm4_protocol/common/assemble/assemble_usdm.py +52 -0
  10. usdm4_protocol/common/base_import.py +60 -0
  11. usdm4_protocol/common/extract/__init__.py +1 -0
  12. usdm4_protocol/common/extract/combined_row_classifier.py +254 -0
  13. usdm4_protocol/common/extract/content_extractor.py +67 -0
  14. usdm4_protocol/common/extract/ie_extractor.py +328 -0
  15. usdm4_protocol/common/extract/section_finder.py +49 -0
  16. usdm4_protocol/common/extract/soa_row_classifier.py +201 -0
  17. usdm4_protocol/common/extract/utility.py +155 -0
  18. usdm4_protocol/common/html/__init__.py +1 -0
  19. usdm4_protocol/common/html/clean_html.py +1 -0
  20. usdm4_protocol/common/html/expand_table.py +133 -0
  21. usdm4_protocol/common/html/soup_utils.py +35 -0
  22. usdm4_protocol/common/load/__init__.py +18 -0
  23. usdm4_protocol/cpt/__init__.py +66 -0
  24. usdm4_protocol/cpt/import_/__init__.py +0 -0
  25. usdm4_protocol/cpt/import_/cpt_import.py +53 -0
  26. usdm4_protocol/cpt/import_/extract/__init__.py +55 -0
  27. usdm4_protocol/cpt/import_/extract/lab_tests.py +397 -0
  28. usdm4_protocol/cpt/import_/extract/title_page.py +197 -0
  29. usdm4_protocol/cpt/import_/load/__init__.py +2 -0
  30. usdm4_protocol/cpt/views/__init__.py +0 -0
  31. usdm4_protocol/cpt/views/document_view.py +49 -0
  32. usdm4_protocol/legacy/__init__.py +41 -0
  33. usdm4_protocol/legacy/import_/__init__.py +0 -0
  34. usdm4_protocol/legacy/import_/extract/__init__.py +132 -0
  35. usdm4_protocol/legacy/import_/extract/inclusion_exclusion.py +182 -0
  36. usdm4_protocol/legacy/import_/extract/schedule_of_activities.py +277 -0
  37. usdm4_protocol/legacy/import_/extract/title_page.py +87 -0
  38. usdm4_protocol/legacy/import_/legacy_import.py +54 -0
  39. usdm4_protocol/legacy/import_/load/__init__.py +19 -0
  40. usdm4_protocol/legacy/import_/load/clean_html.py +216 -0
  41. usdm4_protocol/legacy/import_/load/split_html.py +307 -0
  42. usdm4_protocol/legacy/import_/load/to_html.py +54 -0
  43. usdm4_protocol/legacy/import_/load/to_html_base.py +30 -0
  44. usdm4_protocol/legacy/import_/load/to_html_docling.py +49 -0
  45. usdm4_protocol/legacy/import_/load/to_html_pymupdf.py +203 -0
  46. usdm4_protocol/legacy/views/__init__.py +0 -0
  47. usdm4_protocol/m11/__init__.py +101 -0
  48. usdm4_protocol/m11/data/mapping/title_page_mapping.yaml +1488 -0
  49. usdm4_protocol/m11/data/specification/elements/title_page_elements.yaml +3153 -0
  50. usdm4_protocol/m11/data/specification/sections.yaml +10 -0
  51. usdm4_protocol/m11/data/specification/templates/title_page_template.html +260 -0
  52. usdm4_protocol/m11/elements/__init__.py +3 -0
  53. usdm4_protocol/m11/elements/elements.py +479 -0
  54. usdm4_protocol/m11/export/__init__.py +0 -0
  55. usdm4_protocol/m11/export/m11_export.py +88 -0
  56. usdm4_protocol/m11/import_/__init__.py +0 -0
  57. usdm4_protocol/m11/import_/extract/__init__.py +46 -0
  58. usdm4_protocol/m11/import_/extract/amendments.py +381 -0
  59. usdm4_protocol/m11/import_/extract/document.py +86 -0
  60. usdm4_protocol/m11/import_/extract/inclusion_exclusion.py +155 -0
  61. usdm4_protocol/m11/import_/extract/title_page.py +708 -0
  62. usdm4_protocol/m11/import_/extract/utility.py +10 -0
  63. usdm4_protocol/m11/import_/load/__init__.py +2 -0
  64. usdm4_protocol/m11/import_/m11_import.py +20 -0
  65. usdm4_protocol/m11/specification/__init__.py +46 -0
  66. usdm4_protocol/m11/specification/files.py +30 -0
  67. usdm4_protocol/m11/specification/section.py +54 -0
  68. usdm4_protocol/m11/specification/sections.py +24 -0
  69. usdm4_protocol/m11/views/__init__.py +0 -0
  70. usdm4_protocol/m11/views/data_view.py +37 -0
  71. usdm4_protocol/m11/views/document_view.py +20 -0
  72. usdm4_protocol/soa/__init__.py +1 -0
  73. usdm4_protocol/soa/decode_soa.py +197 -0
  74. usdm4_protocol/soa/features/__init__.py +1 -0
  75. usdm4_protocol/soa/features/activities.py +167 -0
  76. usdm4_protocol/soa/features/activity_row.py +80 -0
  77. usdm4_protocol/soa/features/conditions.py +49 -0
  78. usdm4_protocol/soa/features/epochs.py +241 -0
  79. usdm4_protocol/soa/features/notes.py +171 -0
  80. usdm4_protocol/soa/features/row_classifier.py +318 -0
  81. usdm4_protocol/soa/features/timepoints.py +475 -0
  82. usdm4_protocol/soa/features/utility.py +47 -0
  83. usdm4_protocol/soa/features/visits.py +222 -0
  84. usdm4_protocol/soa/features/windows.py +293 -0
  85. usdm4_protocol/soa/soa_extractor.py +103 -0
  86. usdm4_protocol/soa/soa_model.py +236 -0
  87. usdm4_protocol-0.1.0.dist-info/METADATA +286 -0
  88. usdm4_protocol-0.1.0.dist-info/RECORD +91 -0
  89. usdm4_protocol-0.1.0.dist-info/WHEEL +5 -0
  90. usdm4_protocol-0.1.0.dist-info/licenses/LICENSE +661 -0
  91. usdm4_protocol-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3 @@
# Release version of this package; assemble_usdm.py imports it as `system_version`.
1
+ __package_version__ = "0.1.0"
# NOTE(review): presumably the USDM model version this package targets - confirm.
2
+ __model_version__ = "4.0.0"
# Human-readable system name; assemble_usdm.py imports it as `system_name`.
3
+ __system_name__ = "USDM4 Protocol Package"
@@ -0,0 +1,104 @@
1
+ from usdm4_protocol.__info__ import (
2
+ __package_version__ as __package_version__,
3
+ __model_version__ as __model_version__,
4
+ __system_name__ as __system_name__,
5
+ )
6
+ from usdm4_protocol.m11 import USDM4M11
7
+ from usdm4_protocol.cpt import USDM4CPT
8
+ from usdm4_protocol.legacy import USDM4Legacy
9
+ from simple_error_log.errors import Errors
10
+
11
+
12
+ class USDM4Protocol:
13
+ """Unified entry point for importing clinical trial protocols into USDM4.
14
+
15
+ Supports three protocol formats:
16
+ - M11 (ICH M11 template, DOCX)
17
+ - CPT (TransCelerate Common Protocol Template, DOCX)
18
+ - Legacy (unknown sponsor format, PDF)
19
+ """
20
+
21
+ def __init__(self):
22
+ self._errors = Errors()
23
+ self._handler = None
24
+
25
+ def from_m11(self, filepath: str, use_ai: bool = False):
26
+ """Import an M11-formatted protocol (DOCX)."""
27
+ self._handler = USDM4M11()
28
+ return self._handler.from_docx(filepath, use_ai=use_ai)
29
+
30
+ def from_cpt(self, filepath: str):
31
+ """Import a CPT-formatted protocol (DOCX)."""
32
+ self._handler = USDM4CPT()
33
+ return self._handler.from_docx(filepath)
34
+
35
+ def from_pdf(self, filepath: str, pdf_converter: str = "auto"):
36
+ """Import a legacy PDF protocol.
37
+
38
+ Args:
39
+ filepath: Path to the PDF file.
40
+ pdf_converter: Which converter to use.
41
+ - "auto": Use docling if available, otherwise pymupdf.
42
+ - "docling": Use docling (pip install usdm4_protocol[pdf-docling]).
43
+ - "pymupdf": Use pymupdf (pip install usdm4_protocol[pdf]).
44
+ """
45
+ self._handler = USDM4Legacy()
46
+ return self._handler.from_pdf(filepath, pdf_converter=pdf_converter)
47
+
48
+ def from_file(self, filepath: str, use_ai: bool = False):
49
+ """Import a protocol, detecting format from the file extension and content.
50
+
51
+ For .pdf files, uses the Legacy handler.
52
+ For .docx files, attempts M11 first (if use_ai is requested or M11 markers
53
+ are detected), otherwise falls back to CPT.
54
+ """
55
+ lower = filepath.lower()
56
+ if lower.endswith(".pdf"):
57
+ return self.from_pdf(filepath, pdf_converter="auto")
58
+ elif lower.endswith(".docx"):
59
+ if use_ai:
60
+ return self.from_m11(filepath, use_ai=True)
61
+ return self.from_cpt(filepath)
62
+ else:
63
+ self._errors.error(f"Unsupported file format: {filepath}")
64
+ return None
65
+
66
+ def to_html(self, file_path: str, template: str = "M11") -> str | None:
67
+ """Export USDM4 data to HTML.
68
+
69
+ Args:
70
+ file_path: Path to USDM4 JSON file.
71
+ template: Template to use ("M11" or "CPT").
72
+ """
73
+ if template.upper() == "M11":
74
+ handler = USDM4M11()
75
+ return handler.to_html(file_path)
76
+ elif template.upper() == "CPT":
77
+ handler = USDM4CPT()
78
+ return handler.to_html(file_path)
79
+ else:
80
+ self._errors.error(f"Unsupported template: {template}")
81
+ return None
82
+
83
+ def data_views(self, file_path: str) -> dict:
84
+ """Generate data views from USDM4 data (M11 format)."""
85
+ handler = USDM4M11()
86
+ return handler.data_views(file_path)
87
+
88
+ @property
89
+ def source(self) -> dict:
90
+ if self._handler:
91
+ return self._handler.source
92
+ return {}
93
+
94
+ @property
95
+ def source_no_sections(self) -> dict:
96
+ if self._handler:
97
+ return self._handler.source_no_sections
98
+ return {}
99
+
100
+ @property
101
+ def errors(self):
102
+ if self._handler:
103
+ return self._handler.errors
104
+ return self._errors
@@ -0,0 +1 @@
1
+ # Common utilities for all packages
@@ -0,0 +1 @@
1
+ # AI providers
@@ -0,0 +1,53 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
+ class BaseAIProvider(ABC):
5
+ """Abstract base class for AI providers."""
6
+
7
+ @property
8
+ @abstractmethod
9
+ def available(self) -> bool:
10
+ """Check if the AI provider is available."""
11
+ pass
12
+
13
+ @abstractmethod
14
+ def prompt(self, text: str, system: str = "") -> str:
15
+ """
16
+ Send a prompt to the AI model and get a response.
17
+
18
+ Args:
19
+ text: The prompt text
20
+ system: Optional system message for context
21
+
22
+ Returns:
23
+ The response text from the AI model, or None if unavailable
24
+ """
25
+ pass
26
+
27
+ @abstractmethod
28
+ def streaming_prompt(self, text: str, system_message: str = "") -> str:
29
+ """
30
+ Send a prompt to the AI model with streaming response.
31
+
32
+ Args:
33
+ text: The prompt text
34
+ system_message: Optional system message for context
35
+
36
+ Returns:
37
+ The full response text from the AI model, or None if unavailable
38
+ """
39
+ pass
40
+
41
+ @abstractmethod
42
+ def extract_json(self, text: str, dict: bool = True) -> dict | list | None:
43
+ """
44
+ Extract JSON from AI response text.
45
+
46
+ Args:
47
+ text: The response text containing JSON
48
+ dict: If True, extract dictionary; if False, extract list
49
+
50
+ Returns:
51
+ Parsed JSON object/list, or None if extraction failed
52
+ """
53
+ pass
@@ -0,0 +1,175 @@
1
+ import json
2
+ from simple_error_log.errors import Errors
3
+ from simple_error_log.error_location import KlassMethodLocation
4
+ from usdm4_protocol.common.ai.base_ai import BaseAIProvider
5
+
6
+ try:
7
+ from anthropic import Anthropic
8
+ from d4k_ms_base.service_environment import ServiceEnvironment
9
+
10
+ ANTHROPIC_AVAILABLE = True
11
+ except ImportError:
12
+ ANTHROPIC_AVAILABLE = False
13
+
14
+
class ClaudeProvider(BaseAIProvider):
    """Claude AI provider using Anthropic API.

    Construction never raises: a missing package, a missing API key or a
    client construction failure is logged to the supplied error log and
    leaves the provider flagged as unavailable.
    """

    MODULE = "usdm4_protocol.common.ai.claude_provider.ClaudeProvider"
    DEFAULT_MODEL = "claude-haiku-4-5-20251001"
    # NOTE(review): not referenced in this class. Presumably a price per million
    # tokens; the figures look high for a Haiku-class model - confirm.
    MODEL_PRICING = {DEFAULT_MODEL: {"input": 15.0, "output": 75.0}}

    def __init__(self, errors: Errors, model: str | None = None):
        """
        Initialize the Claude provider.

        Args:
            errors: Error logging object
            model: Model name (defaults to DEFAULT_MODEL)
        """
        self._errors = errors
        self._model = model or self.DEFAULT_MODEL
        self._client = None
        self._available = False

        if not ANTHROPIC_AVAILABLE:
            errors.warning(
                "Anthropic package not installed. Install with 'pip install anthropic d4k_ms_base'",
                KlassMethodLocation(self.MODULE, "__init__"),
            )
            return

        try:
            api_key = ServiceEnvironment().get("ANTHROPIC_API_KEY")
            if not api_key:
                errors.error(
                    "Anthropic API key environment variable is not set",
                    KlassMethodLocation(self.MODULE, "__init__"),
                )
            else:
                self._client = Anthropic(api_key=api_key)
                self._available = True
        except Exception as e:
            # Best effort: a failure here is logged and leaves the provider unavailable.
            errors.exception(
                "Failed to initialize Anthropic client",
                e,
                KlassMethodLocation(self.MODULE, "__init__"),
            )

    @property
    def available(self) -> bool:
        """Check if the Claude provider is available."""
        return self._available

    def prompt(self, text: str, system: str = "") -> str | None:
        """Send a prompt to Claude and get a response.

        Args:
            text: The prompt text, sent as a single user message
            system: Optional system message for context

        Returns:
            The text of the first content block of the reply, or None when
            there is no client or the request fails (the failure is logged).
        """
        if not self._client:
            self._errors.error(
                "No client object found",
                KlassMethodLocation(self.MODULE, "prompt"),
            )
            return None

        try:
            message = self._client.messages.create(
                max_tokens=1024,
                system=system or "",
                messages=[{"role": "user", "content": text}],
                model=self._model,
            )
            return message.content[0].text
        except Exception as e:
            self._errors.exception(
                "Error executing prompt",
                e,
                KlassMethodLocation(self.MODULE, "prompt"),
            )
            return None

    def streaming_prompt(self, text: str, system_message: str = "") -> str | None:
        """Send a prompt to Claude with streaming response.

        Args:
            text: The prompt text, sent as a single user message
            system_message: Optional system message for context

        Returns:
            The concatenated streamed text, or None when there is no client
            or the request or stream fails (the failure is logged).
        """
        if not self._client:
            self._errors.error(
                "No client object found",
                KlassMethodLocation(self.MODULE, "streaming_prompt"),
            )
            return None

        try:
            with self._client.messages.stream(
                model=self._model,
                max_tokens=16384,
                temperature=0,
                system=system_message or "",
                messages=[{"role": "user", "content": text}],
            ) as stream:
                return self._streaming_response(stream)
        except Exception as e:
            self._errors.exception(
                "Error executing streaming prompt",
                e,
                KlassMethodLocation(self.MODULE, "streaming_prompt"),
            )
            return None

    def extract_json(self, text: str, dict: bool = True) -> dict | list | None:
        """Extract JSON from response text.

        Newlines are removed, then the span from the first opening bracket to
        the last matching closing bracket is parsed.

        Args:
            text: The response text containing JSON
            dict: If True look for ``{...}``; if False look for ``[...]``

        Returns:
            The parsed object or list, or None when the text is empty, no
            bracketed span exists, or parsing fails (each case is logged).
        """
        if not text:
            self._errors.error(
                "Error decoding Claude response - empty text",
                KlassMethodLocation(self.MODULE, "extract_json"),
            )
            return None

        try:
            result = text.replace("\n", "")
            opener, closer = ("{", "}") if dict else ("[", "]")
            s_index = result.find(opener)
            e_index = result.rfind(closer)

            # Both delimiters must be present, with the opener first.
            if 0 <= s_index < e_index:
                return json.loads(result[s_index : e_index + 1])

            self._errors.error(
                "Error decoding Claude response - no JSON found",
                KlassMethodLocation(self.MODULE, "extract_json"),
            )
            return None
        except Exception as e:
            self._errors.exception(
                "Error decoding Claude JSON",
                e,
                KlassMethodLocation(self.MODULE, "extract_json"),
            )
            return None

    def _streaming_response(self, stream) -> str | None:
        """Process streamed response from Claude.

        Collects ``delta.text`` from every chunk that carries it.

        Returns:
            The joined text, or None if iterating or joining fails (logged).
        """
        pieces = []
        try:
            for chunk in stream:
                if hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
                    pieces.append(chunk.delta.text)
            # Joined inside the try so a non-string piece is still logged and
            # reported as None, as it was with in-loop concatenation.
            return "".join(pieces)
        except Exception as e:
            self._errors.exception(
                "Error decoding Claude stream",
                e,
                KlassMethodLocation(self.MODULE, "_streaming_response"),
            )
            return None
@@ -0,0 +1,34 @@
1
+ from simple_error_log.errors import Errors
2
+ from usdm4_protocol.common.ai.base_ai import BaseAIProvider
3
+
4
+
class FallbackProvider(BaseAIProvider):
    """No-op provider: never available, and every operation yields None."""

    MODULE = "usdm4_protocol.common.ai.fallback_provider.FallbackProvider"

    def __init__(self, errors: Errors):
        """
        Store the error log; nothing else is set up.

        Args:
            errors: Error logging object
        """
        self._errors = errors

    @property
    def available(self) -> bool:
        """Always False for the fallback."""
        return False

    def prompt(self, text: str, system: str = "") -> None:
        """Ignore the prompt and return None."""
        return None

    def streaming_prompt(self, text: str, system_message: str = "") -> None:
        """Ignore the prompt and return None."""
        return None

    def extract_json(self, text: str, dict: bool = True) -> None:
        """Ignore the text and return None."""
        return None
@@ -0,0 +1 @@
1
+ # Assembly utilities
@@ -0,0 +1,52 @@
1
+ from usdm4 import USDM4
2
+ from simple_error_log.errors import Errors
3
+ from simple_error_log.error_location import KlassMethodLocation
4
+ from usdm4.assembler.assembler import Assembler
5
+ from usdm4.api.wrapper import Wrapper
6
+ from usdm4_protocol.__info__ import (
7
+ __package_version__ as system_version,
8
+ __system_name__ as system_name,
9
+ )
10
+
11
+
class AssembleUSDM:
    """Unified USDM assembler for all source formats."""

    MODULE = "usdm4_protocol.common.assemble.assemble_usdm.AssembleUSDM"

    def __init__(self, source_data: dict, errors: Errors):
        """
        Initialize the USDM assembler.

        Args:
            source_data: Source data dictionary to be assembled into USDM4
            errors: Error logging object, also handed to the usdm4 assembler
        """
        self._source_data = source_data
        self._errors = errors
        self._usdm4 = USDM4()
        self._assembler: Assembler = self._usdm4.assembler(self._errors)

    def process(self) -> Wrapper | dict:
        """
        Process the source data and assemble into USDM4.

        Returns:
            The wrapper built by the assembler, stamped with this package's
            system name and version. If execution or wrapper creation raises,
            the exception is logged and an empty dict is returned instead.
        """
        try:
            self._assembler.execute(self._source_data)
            return self._assembler.wrapper(system_name, system_version)
        except Exception as e:
            self._errors.exception(
                "Exception raised assembling USDM",
                e,
                KlassMethodLocation(self.MODULE, "process"),
            )
            # Kept as an empty dict, not None, for existing callers.
            return {}

    @property
    def source(self):
        """Get the source data."""
        return self._source_data
@@ -0,0 +1,60 @@
1
+ import copy
2
+ from abc import ABC, abstractmethod
3
+ from simple_error_log.errors import Errors
4
+ from simple_error_log.error_location import KlassMethodLocation
5
+ from usdm4_protocol.common.assemble.assemble_usdm import AssembleUSDM
6
+
7
+
class BaseImport(ABC):
    """Base class for format-specific import handlers.

    Provides the shared Load -> Extract -> Assemble -> Wrapper pipeline and
    common properties (source, source_no_sections, extra). Subclasses must
    implement _load() and _extract() to supply format-specific behaviour.
    """

    MODULE = "usdm4_protocol.common.base_import.BaseImport"

    def __init__(self, file_path: str, errors: Errors):
        """
        Args:
            file_path: Path of the file to import
            errors: Error logging object
        """
        self._file_path = file_path
        self._errors = errors
        # Stays None until _extract() has produced the study data.
        self._study = None

    def process(self):
        """Run load, extract and assemble for the configured file.

        Returns:
            The result of AssembleUSDM.process(), or None if any step raises
            (the exception is logged with the file path).
        """
        try:
            loaded = self._load()
            self._study = self._extract(loaded)
            return AssembleUSDM(self._study, self._errors).process()
        except Exception as e:
            self._errors.exception(
                f"Exception raised processing '{self._file_path}'",
                e,
                KlassMethodLocation(self.MODULE, "process"),
            )
            return None

    @abstractmethod
    def _load(self):
        """Load the source file. Returns format-specific data."""

    @abstractmethod
    def _extract(self, loaded) -> dict:
        """Extract study data from the loaded source. Returns study dict."""

    @property
    def source(self) -> dict | None:
        """The extracted study data, or None if nothing has been extracted."""
        return self._study

    @property
    def source_no_sections(self) -> dict:
        """A deep copy of the study data with the document sections emptied.

        Returns an empty dict when no study data is available, instead of
        failing on the None left by a skipped or failed extraction. A study
        without a "document" entry is returned as an unmodified copy.
        """
        if not self._study:
            return {}
        the_copy = copy.deepcopy(self._study)
        document = the_copy.get("document")
        if document is not None:
            document["sections"] = []
        return the_copy

    @property
    def extra(self) -> dict:
        """Override in subclasses that need extra metadata."""
        return {}
@@ -0,0 +1 @@
1
+ # Extract utilities