cs-models 0.0.827__py3-none-any.whl → 0.0.847__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cs_models/resources/CompanyOUS/models.py +2 -0
- cs_models/resources/CompanyOUS/schemas.py +4 -0
- cs_models/resources/CompanySEC/models.py +2 -0
- cs_models/resources/CompanySEC/schemas.py +4 -0
- cs_models/resources/DeepResearchAgenticUnit/__init__.py +14 -0
- cs_models/resources/DeepResearchAgenticUnit/models.py +123 -0
- cs_models/resources/DeepResearchAgenticUnit/schemas.py +50 -0
- cs_models/resources/DeepResearchSession/__init__.py +20 -0
- cs_models/resources/DeepResearchSession/models.py +170 -0
- cs_models/resources/DeepResearchSession/schemas.py +94 -0
- cs_models/resources/DeepResearchSubTask/__init__.py +20 -0
- cs_models/resources/DeepResearchSubTask/models.py +177 -0
- cs_models/resources/DeepResearchSubTask/schemas.py +105 -0
- cs_models/resources/MeetingUserDocument/__init__.py +0 -0
- cs_models/resources/MeetingUserDocument/models.py +39 -0
- cs_models/resources/MeetingUserDocument/schemas.py +17 -0
- cs_models/resources/PipelineCrawlSession/__init__.py +0 -0
- cs_models/resources/PipelineCrawlSession/models.py +67 -0
- cs_models/resources/PipelineCrawlSession/schemas.py +22 -0
- cs_models/resources/PipelineCrawledPage/__init__.py +0 -0
- cs_models/resources/PipelineCrawledPage/models.py +80 -0
- cs_models/resources/PipelineCrawledPage/schemas.py +34 -0
- cs_models/resources/PipelineDrugPortfolio/__init__.py +0 -0
- cs_models/resources/PipelineDrugPortfolio/models.py +92 -0
- cs_models/resources/PipelineDrugPortfolio/schemas.py +31 -0
- cs_models/resources/PipelineExtractionLog/__init__.py +0 -0
- cs_models/resources/PipelineExtractionLog/models.py +55 -0
- cs_models/resources/PipelineExtractionLog/schemas.py +23 -0
- cs_models/resources/PubmedMeetingSellSideSignal/__init__.py +0 -0
- cs_models/resources/PubmedMeetingSellSideSignal/models.py +64 -0
- cs_models/resources/PubmedMeetingSellSideSignal/schemas.py +21 -0
- cs_models/resources/PubmedMeetingUserDocument/__init__.py +0 -0
- cs_models/resources/PubmedMeetingUserDocument/models.py +40 -0
- cs_models/resources/PubmedMeetingUserDocument/schemas.py +16 -0
- cs_models/resources/SellSideAbstractMention/__init__.py +0 -0
- cs_models/resources/SellSideAbstractMention/models.py +57 -0
- cs_models/resources/SellSideAbstractMention/schemas.py +28 -0
- cs_models/resources/SellSideAbstractMentionLink/__init__.py +0 -0
- cs_models/resources/SellSideAbstractMentionLink/models.py +60 -0
- cs_models/resources/SellSideAbstractMentionLink/schemas.py +24 -0
- cs_models/resources/SellSideSource/__init__.py +0 -0
- cs_models/resources/SellSideSource/models.py +25 -0
- cs_models/resources/SellSideSource/schemas.py +13 -0
- cs_models/resources/UserDocument/models.py +7 -0
- cs_models/resources/UserDocument/schemas.py +2 -0
- cs_models/resources/UserDocumentAccess/models.py +6 -0
- cs_models/resources/UserDocumentAccess/schemas.py +1 -0
- cs_models/resources/Workbook/models.py +9 -0
- cs_models/resources/Workbook/schemas.py +6 -0
- cs_models/resources/WorkbookCommentThread/__init__.py +0 -0
- cs_models/resources/WorkbookCommentThread/models.py +59 -0
- cs_models/resources/WorkbookCommentThread/schemas.py +35 -0
- cs_models/resources/WorkbookThreadComment/__init__.py +0 -0
- cs_models/resources/WorkbookThreadComment/models.py +38 -0
- cs_models/resources/WorkbookThreadComment/schemas.py +14 -0
- {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/METADATA +1 -1
- {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/RECORD +59 -14
- {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/WHEEL +0 -0
- {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Models for Deep Research SubTasks - Individual research agents."""
|
|
2
|
+
import enum
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import (
|
|
6
|
+
Column,
|
|
7
|
+
DateTime,
|
|
8
|
+
Integer,
|
|
9
|
+
String,
|
|
10
|
+
Text,
|
|
11
|
+
Boolean,
|
|
12
|
+
ForeignKey,
|
|
13
|
+
Enum,
|
|
14
|
+
Float,
|
|
15
|
+
)
|
|
16
|
+
from sqlalchemy.orm import relationship
|
|
17
|
+
|
|
18
|
+
from ...database import Base
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SubTaskStatusEnum(enum.Enum):
    """Lifecycle states for a deep-research subtask."""

    PENDING = "PENDING"        # created but not yet scheduled
    QUEUED = "QUEUED"          # waiting for an execution slot
    RUNNING = "RUNNING"        # currently executing
    COMPLETED = "COMPLETED"    # finished successfully
    FAILED = "FAILED"          # terminated with an error
    SKIPPED = "SKIPPED"        # intentionally not executed
    EXPANDED = "EXPANDED"      # template task that has been expanded into child tasks
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SubTaskTypeEnum(enum.Enum):
    """Research dimensions and special task types for subtasks."""

    # Research dimensions
    CUSTOM = "CUSTOM"
    INVESTIGATION = "INVESTIGATION"
    DISCOVERY = "DISCOVERY"          # entity-discovery tasks
    SMART_GRID = "SMART_GRID"        # SmartGrid matrix research (items × dimensions)
    AGGREGATION = "AGGREGATION"      # aggregation of other task results
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DeepResearchSubTaskModel(Base):
    """
    Model for Deep Research SubTasks.

    Each subtask represents an independent research agent that executes
    with a fresh context window. Subtasks do not communicate with each
    other - all coordination flows through the main orchestrator.
    """

    __tablename__ = "deep_research_subtasks"

    id = Column(Integer, primary_key=True)

    # Parent session (required for every subtask)
    session_id = Column(Integer, ForeignKey("deep_research_sessions.id"), nullable=False)

    # V2: parent agentic unit; nullable so v1 rows remain valid
    agentic_unit_id = Column(Integer, ForeignKey("deep_research_agentic_units.id"), nullable=True)

    # --- Task identification ---
    task_id = Column(String(100), nullable=True)  # logical task id, e.g. "t1", "t2_ide-cel"
    sequence_number = Column(Integer, nullable=False, default=0)
    task_type = Column(
        "task_type",
        Enum(SubTaskTypeEnum),
        default=SubTaskTypeEnum.CUSTOM,
        nullable=False,
    )
    task_label = Column(String(256), nullable=True)

    # --- Template/expansion tracking ---
    is_template = Column(Boolean, nullable=True, default=False)  # True for template tasks
    # Links an expanded task back to the template it was generated from
    parent_task_id = Column(Integer, ForeignKey("deep_research_subtasks.id"), nullable=True)

    # --- Task specification (JSON blobs stored as Text) ---
    specification = Column(Text, nullable=False)  # JSON: focus_question, entities, etc.
    focus_question = Column(Text, nullable=True)  # denormalized for quick access
    entities = Column(Text, nullable=True)  # JSON array of entity objects
    search_scope = Column(Text, nullable=True)  # JSON array, e.g. ["PUBLICATION", "CLINICAL_TRIAL"]
    expected_output_format = Column(String(50), nullable=True)  # structured, narrative, table_row
    time_range_start = Column(DateTime, nullable=True)
    time_range_end = Column(DateTime, nullable=True)

    # --- Entity discovery (DISCOVERY tasks / for_each_entity_from pattern) ---
    output_entities = Column(Boolean, nullable=True, default=False)  # task emits entities
    entity_type = Column(String(50), nullable=True)  # e.g. "drug", "event"
    for_each_entity_from = Column(String(100), nullable=True)  # task_id supplying entities
    entity_name = Column(String(256), nullable=True)  # entity this expanded task targets
    entity_data = Column(Text, nullable=True)  # JSON: full entity info incl. synonyms
    discovered_entities = Column(Text, nullable=True)  # JSON: entities found by this task

    # --- Execution status ---
    status = Column(
        "status",
        Enum(SubTaskStatusEnum),
        default=SubTaskStatusEnum.PENDING,
        nullable=False,
    )
    lambda_request_id = Column(String(256), nullable=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # --- Dependencies (sequential tasks) ---
    depends_on = Column(Text, nullable=True)  # JSON array of subtask IDs
    priority = Column(Integer, nullable=True, default=1)

    # --- Results ---
    result_s3_key = Column(String(512), nullable=True)
    result_summary = Column(Text, nullable=True)  # brief summary of findings
    citations_count = Column(Integer, nullable=True, default=0)
    documents_analyzed = Column(Integer, nullable=True, default=0)

    # --- SmartGrid integration (SMART_GRID task type) ---
    smart_grid_id = Column(Integer, nullable=True)  # FK to SmartGridModel (not enforced)
    smart_grid_analysis_type = Column(String(50), nullable=True)  # "row", "column", or "both"

    # --- Quality metrics (0.0 - 1.0) ---
    confidence_score = Column(Float, nullable=True)
    relevance_score = Column(Float, nullable=True)
    coverage_score = Column(Float, nullable=True)  # how well the question was answered

    # --- Execution metrics ---
    execution_time_ms = Column(Integer, nullable=True)
    tokens_used = Column(Integer, nullable=True)
    search_queries_count = Column(Integer, nullable=True)

    # --- Error handling ---
    error_message = Column(Text, nullable=True)
    error_type = Column(String(100), nullable=True)
    retry_count = Column(Integer, nullable=True, default=0)

    # --- Soft delete and timestamps ---
    is_deleted = Column(Boolean, nullable=True)
    # Lambdas keep the call lazy (plays well with freezegun); utcnow is naive UTC.
    created_at = Column(
        DateTime,
        nullable=False,
        default=lambda: datetime.utcnow(),
    )
    updated_at = Column(
        DateTime,
        nullable=False,
        default=lambda: datetime.utcnow(),
        onupdate=lambda: datetime.utcnow(),
    )

    # --- ORM relationships ---
    session = relationship(
        "DeepResearchSessionModel",
        back_populates="subtasks",
    )

    # Self-referential link: template -> expanded tasks
    parent_task = relationship(
        "DeepResearchSubTaskModel",
        remote_side=[id],
        backref="expanded_tasks",
        foreign_keys=[parent_task_id],
    )

    # V2: parent agentic unit
    agentic_unit = relationship(
        "DeepResearchAgenticUnitModel",
        back_populates="internal_tasks",
    )
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Schemas for Deep Research SubTasks."""
|
|
2
|
+
from marshmallow import Schema, fields, validate
|
|
3
|
+
|
|
4
|
+
from .models import SubTaskStatusEnum, SubTaskTypeEnum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DeepResearchSubTaskResourceSchema(Schema):
    """Serialization schema mirroring DeepResearchSubTaskModel."""

    not_blank = validate.Length(min=1, error="Field cannot be blank")

    id = fields.Integer(dump_only=True)
    session_id = fields.Integer(required=True)

    # Task identification
    task_id = fields.String(allow_none=True)  # logical task id, e.g. "t1", "t2_ide-cel"
    sequence_number = fields.Integer(required=True)
    task_type = fields.Enum(SubTaskTypeEnum, by_value=True)
    task_label = fields.String(allow_none=True)

    # Template/expansion tracking
    is_template = fields.Boolean(allow_none=True)
    parent_task_id = fields.Integer(allow_none=True)

    # Task specification (JSON stored as strings)
    specification = fields.String(required=True)
    focus_question = fields.String(allow_none=True)
    entities = fields.String(allow_none=True)
    search_scope = fields.String(allow_none=True)
    expected_output_format = fields.String(allow_none=True)
    time_range_start = fields.DateTime(allow_none=True)
    time_range_end = fields.DateTime(allow_none=True)

    # Entity discovery fields
    output_entities = fields.Boolean(allow_none=True)
    entity_type = fields.String(allow_none=True)
    for_each_entity_from = fields.String(allow_none=True)
    entity_name = fields.String(allow_none=True)
    entity_data = fields.String(allow_none=True)
    discovered_entities = fields.String(allow_none=True)

    # Execution status
    status = fields.Enum(SubTaskStatusEnum, by_value=True)
    lambda_request_id = fields.String(allow_none=True)
    started_at = fields.DateTime(allow_none=True)
    completed_at = fields.DateTime(allow_none=True)

    # Dependencies
    depends_on = fields.String(allow_none=True)
    priority = fields.Integer(allow_none=True)

    # Results
    result_s3_key = fields.String(allow_none=True)
    result_summary = fields.String(allow_none=True)
    citations_count = fields.Integer(allow_none=True)
    documents_analyzed = fields.Integer(allow_none=True)

    # Smart Grid
    smart_grid_id = fields.Integer(allow_none=True)
    smart_grid_analysis_type = fields.String(allow_none=True)

    # Quality metrics
    confidence_score = fields.Float(allow_none=True)
    relevance_score = fields.Float(allow_none=True)
    coverage_score = fields.Float(allow_none=True)

    # Execution metrics
    execution_time_ms = fields.Integer(allow_none=True)
    tokens_used = fields.Integer(allow_none=True)
    search_queries_count = fields.Integer(allow_none=True)

    # Error handling
    error_message = fields.String(allow_none=True)
    error_type = fields.String(allow_none=True)
    retry_count = fields.Integer(allow_none=True)

    # Soft delete and timestamps
    is_deleted = fields.Boolean(allow_none=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class DeepResearchSubTaskSpecSchema(Schema):
    """Schema for the structured subtask specification object."""

    focus_question = fields.String(required=True)
    entities = fields.List(fields.Dict(), allow_none=True)
    search_scope = fields.List(fields.String(), allow_none=True)
    output_format = fields.String(allow_none=True)
    expected_fields = fields.List(fields.String(), allow_none=True)
    max_documents = fields.Integer(allow_none=True)
    time_range = fields.List(fields.String(), allow_none=True)
    retrieval_hints = fields.Dict(allow_none=True)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class DeepResearchSubTaskCreateSchema(Schema):
    """Input schema used when creating a new subtask."""

    session_id = fields.Integer(required=True)
    sequence_number = fields.Integer(required=True)
    task_type = fields.Enum(SubTaskTypeEnum, by_value=True, required=True)
    task_label = fields.String(allow_none=True)
    specification = fields.Nested(DeepResearchSubTaskSpecSchema, required=True)
    depends_on = fields.List(fields.Integer(), allow_none=True)
    priority = fields.Integer(allow_none=True)
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import (
|
|
4
|
+
Column,
|
|
5
|
+
Text,
|
|
6
|
+
Integer,
|
|
7
|
+
String,
|
|
8
|
+
Boolean,
|
|
9
|
+
DateTime,
|
|
10
|
+
ForeignKey,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from ...database import Base
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MeetingUserDocumentModel(Base):
    """Association between a meeting and a user document."""

    __tablename__ = "meeting_user_documents"

    id = Column(Integer, primary_key=True)
    meeting_id = Column(Integer, ForeignKey('meetings.id'), nullable=False)
    user_document_id = Column(Integer, ForeignKey('user_documents.id'), nullable=False)
    status = Column(String(50), nullable=False)
    is_active = Column(Boolean, nullable=True)
    details = Column(Text, nullable=True)
    updated_at = Column(
        DateTime,
        nullable=False,
        # Lambda keeps the default lazy so freezegun can patch utcnow:
        # https://stackoverflow.com/questions/58776476/why-doesnt-freezegun-work-with-sqlalchemy-default-values
        default=lambda: datetime.utcnow(),
        onupdate=lambda: datetime.utcnow(),
    )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from marshmallow import (
|
|
2
|
+
Schema,
|
|
3
|
+
fields,
|
|
4
|
+
validate,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class MeetingUserDocumentResourceSchema(Schema):
    """Serialization schema mirroring MeetingUserDocumentModel."""

    not_blank = validate.Length(min=1, error='Field cannot be blank')

    id = fields.Integer(dump_only=True)
    meeting_id = fields.Integer(required=True)
    user_document_id = fields.Integer(required=True)
    status = fields.String(required=True)
    is_active = fields.Boolean(allow_none=True)
    details = fields.String(allow_none=True)
    updated_at = fields.DateTime()
|
|
File without changes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from sqlalchemy import (
|
|
2
|
+
Column,
|
|
3
|
+
Integer,
|
|
4
|
+
String,
|
|
5
|
+
DateTime,
|
|
6
|
+
Float,
|
|
7
|
+
Text,
|
|
8
|
+
ForeignKey,
|
|
9
|
+
Enum,
|
|
10
|
+
)
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
|
|
13
|
+
from ...database import Base
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PipelineCrawlSessionModel(Base):
    """One crawl run over a company's website for pipeline-drug extraction."""

    __tablename__ = 'pipeline_crawl_sessions'

    id = Column(Integer, primary_key=True)
    # Either SEC or OUS company may be set; both are optional.
    company_sec_id = Column(Integer, ForeignKey('companies_sec.id'), nullable=True, index=True)
    company_ous_id = Column(Integer, ForeignKey('companies_ous.id'), nullable=True, index=True)

    # Session tracking
    session_uuid = Column(String(36), nullable=False, unique=True, index=True)
    crawl_date = Column(DateTime, nullable=False, default=datetime.utcnow, index=True)

    # Timing metrics (seconds)
    crawl_duration_seconds = Column(Float, nullable=True)
    page_discovery_time_seconds = Column(Float, nullable=True)
    extraction_time_seconds = Column(Float, nullable=True)

    # Status tracking (MySQL ENUM)
    status = Column(
        Enum('started', 'discovering_pages', 'extracting_data', 'completed', 'failed', 'timeout'),
        nullable=False,
        default='started',
        index=True,
    )
    error_message = Column(Text, nullable=True)

    # Crawl statistics
    total_pages_discovered = Column(Integer, nullable=True, default=0)
    total_pages_crawled = Column(Integer, nullable=True, default=0)
    total_screenshots_captured = Column(Integer, nullable=True, default=0)
    total_drugs_extracted = Column(Integer, nullable=True, default=0)

    # Crawl configuration
    crawl_method = Column(String(50), nullable=True)
    max_depth = Column(Integer, nullable=True, default=2)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from marshmallow import Schema, fields
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class PipelineCrawlSessionSchema(Schema):
    """Serialization schema mirroring PipelineCrawlSessionModel."""

    id = fields.Int(dump_only=True)
    company_sec_id = fields.Int(allow_none=True)
    company_ous_id = fields.Int(allow_none=True)
    session_uuid = fields.Str(required=True)
    crawl_date = fields.DateTime(required=True)
    crawl_duration_seconds = fields.Float(allow_none=True)
    page_discovery_time_seconds = fields.Float(allow_none=True)
    extraction_time_seconds = fields.Float(allow_none=True)
    status = fields.Str(required=True)
    error_message = fields.Str(allow_none=True)
    total_pages_discovered = fields.Int(allow_none=True)
    total_pages_crawled = fields.Int(allow_none=True)
    total_screenshots_captured = fields.Int(allow_none=True)
    total_drugs_extracted = fields.Int(allow_none=True)
    crawl_method = fields.Str(allow_none=True)
    max_depth = fields.Int(allow_none=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from sqlalchemy import (
|
|
2
|
+
Column,
|
|
3
|
+
Integer,
|
|
4
|
+
String,
|
|
5
|
+
DateTime,
|
|
6
|
+
Float,
|
|
7
|
+
Boolean,
|
|
8
|
+
ForeignKey,
|
|
9
|
+
Enum,
|
|
10
|
+
Index,
|
|
11
|
+
UniqueConstraint,
|
|
12
|
+
)
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
|
|
15
|
+
from ...database import Base
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class PipelineCrawledPageModel(Base):
    """A single page fetched during a pipeline crawl session."""

    __tablename__ = 'pipeline_crawled_pages'

    id = Column(Integer, primary_key=True)
    session_id = Column(Integer, ForeignKey('pipeline_crawl_sessions.id'), nullable=False, index=True)

    # URL information
    url = Column(String(1024), nullable=False)
    url_hash = Column(String(64), nullable=False, index=True)  # hash used for dedup per session
    page_title = Column(String(512), nullable=True)

    # Page content storage (S3)
    html_content_s3_key = Column(String(512), nullable=True)
    html_content_hash = Column(String(64), nullable=True)
    html_content_length = Column(Integer, nullable=True)

    # Screenshot storage (S3)
    screenshot_s3_key = Column(String(512), nullable=True)
    screenshot_hash = Column(String(64), nullable=True)
    screenshot_width = Column(Integer, nullable=True)
    screenshot_height = Column(Integer, nullable=True)
    screenshot_file_size = Column(Integer, nullable=True)

    # Page classification (MySQL ENUM)
    page_type = Column(
        Enum('pipeline', 'science', 'rd', 'products', 'clinical_trials', 'other'),
        nullable=True,
    )
    relevance_score = Column(Float, nullable=True)

    # Extraction metadata
    has_drug_data = Column(Boolean, nullable=True, default=False)
    extraction_method = Column(String(50), nullable=True)

    # Crawl metadata
    crawl_depth = Column(Integer, nullable=True, default=0)
    # Self-referential link: page discovered from another crawled page
    parent_page_id = Column(Integer, ForeignKey('pipeline_crawled_pages.id'), nullable=True)
    discovered_from_url = Column(String(1024), nullable=True)

    # Technical metadata
    http_status_code = Column(Integer, nullable=True)
    content_type = Column(String(128), nullable=True)
    response_time_ms = Column(Integer, nullable=True)

    # Timestamps
    crawled_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # A given URL hash may appear at most once per session.
    __table_args__ = (
        Index('idx_session_url_hash', 'session_id', 'url_hash'),
        UniqueConstraint('session_id', 'url_hash', name='uq_session_url'),
    )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from marshmallow import Schema, fields
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class PipelineCrawledPageSchema(Schema):
    """Serialization schema mirroring PipelineCrawledPageModel."""

    id = fields.Int(dump_only=True)
    session_id = fields.Int(required=True)
    url = fields.Str(required=True)
    url_hash = fields.Str(required=True)
    page_title = fields.Str(allow_none=True)
    html_content_s3_key = fields.Str(allow_none=True)
    html_content_hash = fields.Str(allow_none=True)
    html_content_length = fields.Int(allow_none=True)
    screenshot_s3_key = fields.Str(allow_none=True)
    screenshot_hash = fields.Str(allow_none=True)
    screenshot_width = fields.Int(allow_none=True)
    screenshot_height = fields.Int(allow_none=True)
    screenshot_file_size = fields.Int(allow_none=True)
    page_type = fields.Str(allow_none=True)
    relevance_score = fields.Float(allow_none=True)
    has_drug_data = fields.Bool(allow_none=True)
    extraction_method = fields.Str(allow_none=True)
    crawl_depth = fields.Int(allow_none=True)
    parent_page_id = fields.Int(allow_none=True)
    discovered_from_url = fields.Str(allow_none=True)
    http_status_code = fields.Int(allow_none=True)
    content_type = fields.Str(allow_none=True)
    response_time_ms = fields.Int(allow_none=True)
    crawled_at = fields.DateTime(required=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)


class PipelineCrawledPageResourceSchema(PipelineCrawledPageSchema):
    """Alias kept for naming consistency with other *ResourceSchema classes."""

    pass
|
|
File without changes
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from sqlalchemy import (
|
|
2
|
+
Column,
|
|
3
|
+
BIGINT,
|
|
4
|
+
Integer,
|
|
5
|
+
String,
|
|
6
|
+
DateTime,
|
|
7
|
+
Float,
|
|
8
|
+
Text,
|
|
9
|
+
Boolean,
|
|
10
|
+
ForeignKey,
|
|
11
|
+
Enum,
|
|
12
|
+
Index,
|
|
13
|
+
JSON,
|
|
14
|
+
)
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
|
|
17
|
+
from ...database import Base
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PipelineDrugPortfolioModel(Base):
    """A drug entry extracted from a company's pipeline page."""

    __tablename__ = 'pipeline_drug_portfolio'

    id = Column(BIGINT, primary_key=True)
    session_id = Column(Integer, ForeignKey('pipeline_crawl_sessions.id'), nullable=False, index=True)
    page_id = Column(Integer, ForeignKey('pipeline_crawled_pages.id'), nullable=False, index=True)

    # Either SEC or OUS company may be set; both are optional.
    company_sec_id = Column(Integer, ForeignKey('companies_sec.id'), nullable=True, index=True)
    company_ous_id = Column(Integer, ForeignKey('companies_ous.id'), nullable=True, index=True)

    # Core drug information
    drug_name = Column(String(512), nullable=False, index=True)
    drug_name_normalized = Column(String(512), nullable=True, index=True)
    synonyms = Column(JSON, nullable=True)

    modality = Column(String(256), nullable=True)
    targets = Column(JSON, nullable=True)
    mechanism_of_action = Column(Text, nullable=True)

    # Indications (JSON)
    indications = Column(JSON, nullable=False)

    # Additional information (JSON)
    additional_info = Column(JSON, nullable=True)

    # Extraction metadata (MySQL ENUM)
    extraction_method = Column(
        Enum('html_parsing', 'vision_model', 'hybrid', 'manual'),
        nullable=False,
    )
    extraction_confidence = Column(Float, nullable=True)
    raw_extraction_text = Column(Text, nullable=True)

    # Data quality flags
    is_verified = Column(Boolean, nullable=True, default=False)
    is_duplicate = Column(Boolean, nullable=True, default=False, index=True)
    needs_review = Column(Boolean, nullable=True, default=False, index=True)
    review_notes = Column(Text, nullable=True)

    # Timestamps
    extracted_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Composite indexes for company- and session-scoped drug lookups.
    __table_args__ = (
        Index('idx_company_drug_name', 'company_sec_id', 'drug_name_normalized'),
        Index('idx_session_drug', 'session_id', 'drug_name_normalized'),
    )
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from marshmallow import Schema, fields
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class PipelineDrugPortfolioSchema(Schema):
    """Serialization schema mirroring PipelineDrugPortfolioModel."""

    id = fields.Int(dump_only=True)
    session_id = fields.Int(required=True)
    page_id = fields.Int(required=True)
    company_sec_id = fields.Int(allow_none=True)
    company_ous_id = fields.Int(allow_none=True)
    drug_name = fields.Str(required=True)
    drug_name_normalized = fields.Str(allow_none=True)
    synonyms = fields.Raw(allow_none=True)  # JSON array
    modality = fields.Str(allow_none=True)
    targets = fields.Raw(allow_none=True)  # JSON array
    mechanism_of_action = fields.Str(allow_none=True)
    indications = fields.Raw(required=True)  # JSON array
    additional_info = fields.Raw(allow_none=True)  # JSON object
    extraction_method = fields.Str(required=True)
    extraction_confidence = fields.Float(allow_none=True)
    raw_extraction_text = fields.Str(allow_none=True)
    is_verified = fields.Bool(allow_none=True)
    is_duplicate = fields.Bool(allow_none=True)
    needs_review = fields.Bool(allow_none=True)
    review_notes = fields.Str(allow_none=True)
    extracted_at = fields.DateTime(required=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)


class PipelineDrugPortfolioResourceSchema(PipelineDrugPortfolioSchema):
    """Alias kept for naming consistency with other *ResourceSchema classes."""

    pass
|
|
File without changes
|