cs-models 0.0.827__py3-none-any.whl → 0.0.847__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. cs_models/resources/CompanyOUS/models.py +2 -0
  2. cs_models/resources/CompanyOUS/schemas.py +4 -0
  3. cs_models/resources/CompanySEC/models.py +2 -0
  4. cs_models/resources/CompanySEC/schemas.py +4 -0
  5. cs_models/resources/DeepResearchAgenticUnit/__init__.py +14 -0
  6. cs_models/resources/DeepResearchAgenticUnit/models.py +123 -0
  7. cs_models/resources/DeepResearchAgenticUnit/schemas.py +50 -0
  8. cs_models/resources/DeepResearchSession/__init__.py +20 -0
  9. cs_models/resources/DeepResearchSession/models.py +170 -0
  10. cs_models/resources/DeepResearchSession/schemas.py +94 -0
  11. cs_models/resources/DeepResearchSubTask/__init__.py +20 -0
  12. cs_models/resources/DeepResearchSubTask/models.py +177 -0
  13. cs_models/resources/DeepResearchSubTask/schemas.py +105 -0
  14. cs_models/resources/MeetingUserDocument/__init__.py +0 -0
  15. cs_models/resources/MeetingUserDocument/models.py +39 -0
  16. cs_models/resources/MeetingUserDocument/schemas.py +17 -0
  17. cs_models/resources/PipelineCrawlSession/__init__.py +0 -0
  18. cs_models/resources/PipelineCrawlSession/models.py +67 -0
  19. cs_models/resources/PipelineCrawlSession/schemas.py +22 -0
  20. cs_models/resources/PipelineCrawledPage/__init__.py +0 -0
  21. cs_models/resources/PipelineCrawledPage/models.py +80 -0
  22. cs_models/resources/PipelineCrawledPage/schemas.py +34 -0
  23. cs_models/resources/PipelineDrugPortfolio/__init__.py +0 -0
  24. cs_models/resources/PipelineDrugPortfolio/models.py +92 -0
  25. cs_models/resources/PipelineDrugPortfolio/schemas.py +31 -0
  26. cs_models/resources/PipelineExtractionLog/__init__.py +0 -0
  27. cs_models/resources/PipelineExtractionLog/models.py +55 -0
  28. cs_models/resources/PipelineExtractionLog/schemas.py +23 -0
  29. cs_models/resources/PubmedMeetingSellSideSignal/__init__.py +0 -0
  30. cs_models/resources/PubmedMeetingSellSideSignal/models.py +64 -0
  31. cs_models/resources/PubmedMeetingSellSideSignal/schemas.py +21 -0
  32. cs_models/resources/PubmedMeetingUserDocument/__init__.py +0 -0
  33. cs_models/resources/PubmedMeetingUserDocument/models.py +40 -0
  34. cs_models/resources/PubmedMeetingUserDocument/schemas.py +16 -0
  35. cs_models/resources/SellSideAbstractMention/__init__.py +0 -0
  36. cs_models/resources/SellSideAbstractMention/models.py +57 -0
  37. cs_models/resources/SellSideAbstractMention/schemas.py +28 -0
  38. cs_models/resources/SellSideAbstractMentionLink/__init__.py +0 -0
  39. cs_models/resources/SellSideAbstractMentionLink/models.py +60 -0
  40. cs_models/resources/SellSideAbstractMentionLink/schemas.py +24 -0
  41. cs_models/resources/SellSideSource/__init__.py +0 -0
  42. cs_models/resources/SellSideSource/models.py +25 -0
  43. cs_models/resources/SellSideSource/schemas.py +13 -0
  44. cs_models/resources/UserDocument/models.py +7 -0
  45. cs_models/resources/UserDocument/schemas.py +2 -0
  46. cs_models/resources/UserDocumentAccess/models.py +6 -0
  47. cs_models/resources/UserDocumentAccess/schemas.py +1 -0
  48. cs_models/resources/Workbook/models.py +9 -0
  49. cs_models/resources/Workbook/schemas.py +6 -0
  50. cs_models/resources/WorkbookCommentThread/__init__.py +0 -0
  51. cs_models/resources/WorkbookCommentThread/models.py +59 -0
  52. cs_models/resources/WorkbookCommentThread/schemas.py +35 -0
  53. cs_models/resources/WorkbookThreadComment/__init__.py +0 -0
  54. cs_models/resources/WorkbookThreadComment/models.py +38 -0
  55. cs_models/resources/WorkbookThreadComment/schemas.py +14 -0
  56. {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/METADATA +1 -1
  57. {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/RECORD +59 -14
  58. {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/WHEEL +0 -0
  59. {cs_models-0.0.827.dist-info → cs_models-0.0.847.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,177 @@
1
+ """Models for Deep Research SubTasks - Individual research agents."""
2
+ import enum
3
+ from datetime import datetime
4
+
5
+ from sqlalchemy import (
6
+ Column,
7
+ DateTime,
8
+ Integer,
9
+ String,
10
+ Text,
11
+ Boolean,
12
+ ForeignKey,
13
+ Enum,
14
+ Float,
15
+ )
16
+ from sqlalchemy.orm import relationship
17
+
18
+ from ...database import Base
19
+
20
+
21
class SubTaskStatusEnum(enum.Enum):
    """Lifecycle status values for deep research subtasks.

    Each member's value mirrors its name, so statuses round-trip cleanly
    through string serialization (the schemas use ``by_value=True``).
    """

    PENDING = "PENDING"
    QUEUED = "QUEUED"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
    SKIPPED = "SKIPPED"
    # A template task that was fanned out into child tasks instead of
    # being executed directly.
    EXPANDED = "EXPANDED"
30
+
31
+
32
class SubTaskTypeEnum(enum.Enum):
    """Kinds of research subtasks: research dimensions plus special types.

    Values mirror member names for clean string round-tripping.
    """

    # Research dimensions
    CUSTOM = "CUSTOM"
    INVESTIGATION = "INVESTIGATION"
    # Entity discovery tasks
    DISCOVERY = "DISCOVERY"
    # SmartGrid matrix research (items x dimensions)
    SMART_GRID = "SMART_GRID"
    # Aggregation of other task results
    AGGREGATION = "AGGREGATION"
40
+
41
+
42
class DeepResearchSubTaskModel(Base):
    """
    Model for Deep Research SubTasks.

    Each subtask represents an independent research agent that executes
    with a fresh context window. Subtasks do not communicate with each
    other - all coordination flows through the main orchestrator.

    Several Text columns (specification, entities, search_scope,
    depends_on, entity_data, discovered_entities) hold JSON-encoded
    payloads; callers are responsible for (de)serialization.
    """

    __tablename__ = "deep_research_subtasks"

    id = Column(Integer, primary_key=True)

    # Parent session
    session_id = Column(
        Integer,
        ForeignKey("deep_research_sessions.id"),
        nullable=False,
    )

    # V2: Parent agentic unit (nullable for backward compatibility with v1)
    agentic_unit_id = Column(
        Integer,
        ForeignKey("deep_research_agentic_units.id"),
        nullable=True,
    )

    # Task identification
    task_id = Column(String(100), nullable=True)  # Logical task ID (e.g., "t1", "t2_ide-cel")
    sequence_number = Column(Integer, nullable=False, default=0)
    # Column name passed explicitly as the first positional argument.
    # SQLAlchemy's Enum type persists the member *names* by default.
    task_type = Column(
        "task_type",
        Enum(SubTaskTypeEnum),
        default=SubTaskTypeEnum.CUSTOM,
        nullable=False,
    )
    task_label = Column(String(256), nullable=True)

    # Template/Expansion tracking
    is_template = Column(Boolean, nullable=True, default=False)  # True if this is a template task
    parent_task_id = Column(
        Integer,
        ForeignKey("deep_research_subtasks.id"),  # self-referential FK
        nullable=True,
    )  # Links expanded tasks to their parent template

    # Task specification (JSON)
    specification = Column(Text, nullable=False)  # JSON with focus_question, entities, etc.
    focus_question = Column(Text, nullable=True)  # Denormalized for quick access
    entities = Column(Text, nullable=True)  # JSON array of entity objects
    search_scope = Column(Text, nullable=True)  # JSON array: ["PUBLICATION", "CLINICAL_TRIAL"]
    expected_output_format = Column(String(50), nullable=True)  # structured, narrative, table_row
    time_range_start = Column(DateTime, nullable=True)
    time_range_end = Column(DateTime, nullable=True)

    # Entity discovery (for DISCOVERY tasks and for_each_entity_from pattern)
    output_entities = Column(Boolean, nullable=True, default=False)  # True if this task outputs entities
    entity_type = Column(String(50), nullable=True)  # Type of entities discovered (e.g., "drug", "event")
    for_each_entity_from = Column(String(100), nullable=True)  # task_id of parent task to get entities from
    entity_name = Column(String(256), nullable=True)  # Entity name this expanded task was created for
    entity_data = Column(Text, nullable=True)  # JSON: Full entity info including synonyms
    discovered_entities = Column(Text, nullable=True)  # JSON: Entities discovered by this task

    # Execution status
    status = Column(
        "status",
        Enum(SubTaskStatusEnum),
        default=SubTaskStatusEnum.PENDING,
        nullable=False,
    )
    lambda_request_id = Column(String(256), nullable=True)  # presumably an AWS Lambda request id — confirm
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Dependencies (for sequential tasks)
    depends_on = Column(Text, nullable=True)  # JSON array of subtask IDs
    priority = Column(Integer, nullable=True, default=1)

    # Results
    result_s3_key = Column(String(512), nullable=True)  # full result payload is stored in S3
    result_summary = Column(Text, nullable=True)  # Brief summary of findings
    citations_count = Column(Integer, nullable=True, default=0)
    documents_analyzed = Column(Integer, nullable=True, default=0)

    # SmartGrid integration (for SMART_GRID task type)
    smart_grid_id = Column(Integer, nullable=True)  # FK to SmartGridModel (not enforced)
    smart_grid_analysis_type = Column(String(50), nullable=True)  # "row", "column", or "both"

    # Quality metrics
    confidence_score = Column(Float, nullable=True)  # 0.0 - 1.0
    relevance_score = Column(Float, nullable=True)  # 0.0 - 1.0
    coverage_score = Column(Float, nullable=True)  # How well the question was answered

    # Execution metrics
    execution_time_ms = Column(Integer, nullable=True)
    tokens_used = Column(Integer, nullable=True)
    search_queries_count = Column(Integer, nullable=True)

    # Error handling
    error_message = Column(Text, nullable=True)
    error_type = Column(String(100), nullable=True)
    retry_count = Column(Integer, nullable=True, default=0)

    # Soft delete and timestamps
    is_deleted = Column(Boolean, nullable=True)
    # utcnow is wrapped in a lambda so the clock is read at flush time and
    # remains patchable in tests (cf. the freezegun StackOverflow note in
    # MeetingUserDocument/models.py elsewhere in this package).
    created_at = Column(
        DateTime,
        nullable=False,
        default=lambda: datetime.utcnow(),
    )
    updated_at = Column(
        DateTime,
        nullable=False,
        default=lambda: datetime.utcnow(),
        onupdate=lambda: datetime.utcnow(),
    )

    # ORM Relationships
    # NOTE(review): assumes DeepResearchSessionModel declares a matching
    # "subtasks" back_populates — confirm in DeepResearchSession/models.py.
    session = relationship(
        "DeepResearchSessionModel",
        back_populates="subtasks",
    )

    # Self-referential relationship for template → expanded tasks
    parent_task = relationship(
        "DeepResearchSubTaskModel",
        remote_side=[id],  # designates the parent (template) side of the self-join
        backref="expanded_tasks",
        foreign_keys=[parent_task_id],
    )

    # V2: Relationship to parent agentic unit
    agentic_unit = relationship(
        "DeepResearchAgenticUnitModel",
        back_populates="internal_tasks",
    )
@@ -0,0 +1,105 @@
1
+ """Schemas for Deep Research SubTasks."""
2
+ from marshmallow import Schema, fields, validate
3
+
4
+ from .models import SubTaskStatusEnum, SubTaskTypeEnum
5
+
6
+
7
class DeepResearchSubTaskResourceSchema(Schema):
    """Schema for DeepResearchSubTaskModel.

    Mirrors the model's columns one-to-one. JSON-bearing columns
    (``specification``, ``entities``, ``search_scope``, ``depends_on``,
    ``entity_data``, ``discovered_entities``) are exposed as raw strings,
    matching their Text storage in the model.

    Fix: added ``agentic_unit_id`` — the model defines this nullable v2
    foreign key, but the schema previously omitted it, so the value was
    silently dropped on load/dump.
    """

    not_blank = validate.Length(min=1, error="Field cannot be blank")

    id = fields.Integer(dump_only=True)
    session_id = fields.Integer(required=True)
    # V2: parent agentic unit (nullable for backward compatibility with v1)
    agentic_unit_id = fields.Integer(allow_none=True)

    # Task identification
    task_id = fields.String(allow_none=True)  # Logical task ID (e.g., "t1", "t2_ide-cel")
    sequence_number = fields.Integer(required=True)
    task_type = fields.Enum(SubTaskTypeEnum, by_value=True)
    task_label = fields.String(allow_none=True)

    # Template/Expansion tracking
    is_template = fields.Boolean(allow_none=True)
    parent_task_id = fields.Integer(allow_none=True)

    # Task specification
    specification = fields.String(required=True)
    focus_question = fields.String(allow_none=True)
    entities = fields.String(allow_none=True)
    search_scope = fields.String(allow_none=True)
    expected_output_format = fields.String(allow_none=True)
    time_range_start = fields.DateTime(allow_none=True)
    time_range_end = fields.DateTime(allow_none=True)

    # Entity discovery fields
    output_entities = fields.Boolean(allow_none=True)
    entity_type = fields.String(allow_none=True)
    for_each_entity_from = fields.String(allow_none=True)
    entity_name = fields.String(allow_none=True)
    entity_data = fields.String(allow_none=True)
    discovered_entities = fields.String(allow_none=True)

    # Execution status
    status = fields.Enum(SubTaskStatusEnum, by_value=True)
    lambda_request_id = fields.String(allow_none=True)
    started_at = fields.DateTime(allow_none=True)
    completed_at = fields.DateTime(allow_none=True)

    # Dependencies
    depends_on = fields.String(allow_none=True)
    priority = fields.Integer(allow_none=True)

    # Results
    result_s3_key = fields.String(allow_none=True)
    result_summary = fields.String(allow_none=True)
    citations_count = fields.Integer(allow_none=True)
    documents_analyzed = fields.Integer(allow_none=True)

    # Smart Grid
    smart_grid_id = fields.Integer(allow_none=True)
    smart_grid_analysis_type = fields.String(allow_none=True)

    # Quality metrics
    confidence_score = fields.Float(allow_none=True)
    relevance_score = fields.Float(allow_none=True)
    coverage_score = fields.Float(allow_none=True)

    # Execution metrics
    execution_time_ms = fields.Integer(allow_none=True)
    tokens_used = fields.Integer(allow_none=True)
    search_queries_count = fields.Integer(allow_none=True)

    # Error handling
    error_message = fields.String(allow_none=True)
    error_type = fields.String(allow_none=True)
    retry_count = fields.Integer(allow_none=True)

    # Timestamps
    is_deleted = fields.Boolean(allow_none=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)
81
+
82
+
83
class DeepResearchSubTaskSpecSchema(Schema):
    """Schema for subtask specification object.

    Validates the structured payload that is stored JSON-serialized in
    ``DeepResearchSubTaskModel.specification``.
    """

    focus_question = fields.String(required=True)
    entities = fields.List(fields.Dict(), allow_none=True)        # entity objects — presumably name/synonyms dicts; confirm
    search_scope = fields.List(fields.String(), allow_none=True)  # e.g. ["PUBLICATION", "CLINICAL_TRIAL"]
    output_format = fields.String(allow_none=True)
    expected_fields = fields.List(fields.String(), allow_none=True)
    max_documents = fields.Integer(allow_none=True)
    time_range = fields.List(fields.String(), allow_none=True)
    retrieval_hints = fields.Dict(allow_none=True)
94
+
95
+
96
class DeepResearchSubTaskCreateSchema(Schema):
    """Schema for creating a subtask.

    Unlike the resource schema, the specification is accepted as a
    structured nested object (validated by DeepResearchSubTaskSpecSchema)
    and ``depends_on`` as a list of integers rather than raw JSON strings.
    """

    session_id = fields.Integer(required=True)
    sequence_number = fields.Integer(required=True)
    task_type = fields.Enum(SubTaskTypeEnum, by_value=True, required=True)
    task_label = fields.String(allow_none=True)
    specification = fields.Nested(DeepResearchSubTaskSpecSchema, required=True)
    depends_on = fields.List(fields.Integer(), allow_none=True)  # subtask IDs this task waits on
    priority = fields.Integer(allow_none=True)
File without changes
@@ -0,0 +1,39 @@
1
+ from datetime import datetime
2
+
3
+ from sqlalchemy import (
4
+ Column,
5
+ Text,
6
+ Integer,
7
+ String,
8
+ Boolean,
9
+ DateTime,
10
+ ForeignKey,
11
+ )
12
+
13
+ from ...database import Base
14
+
15
+
16
class MeetingUserDocumentModel(Base):
    """Association between a meeting and a user document.

    Carries a required ``status`` string plus an optional activity flag
    and free-form ``details`` text. Note: only ``updated_at`` is recorded;
    there is no ``created_at`` column on this table.
    """

    __tablename__ = "meeting_user_documents"

    id = Column(Integer, primary_key=True)
    meeting_id = Column(
        Integer,
        ForeignKey('meetings.id'),
        nullable=False,
    )
    user_document_id = Column(
        Integer,
        ForeignKey('user_documents.id'),
        nullable=False,
    )
    status = Column(String(50), nullable=False)  # free-form status string; not constrained at the DB level
    is_active = Column(Boolean, nullable=True)
    details = Column(Text, nullable=True)
    updated_at = Column(
        DateTime,
        nullable=False,
        # Lambdas so the clock is read at flush time, keeping defaults
        # patchable by freezegun in tests:
        # https://stackoverflow.com/questions/58776476/why-doesnt-freezegun-work-with-sqlalchemy-default-values
        default=lambda: datetime.utcnow(),
        onupdate=lambda: datetime.utcnow(),
    )
@@ -0,0 +1,17 @@
1
+ from marshmallow import (
2
+ Schema,
3
+ fields,
4
+ validate,
5
+ )
6
+
7
+
8
class MeetingUserDocumentResourceSchema(Schema):
    """Marshmallow schema mirroring ``MeetingUserDocumentModel``."""

    # Declared but not attached to any field here — mirrors the pattern
    # used by sibling resource schemas in this package.
    not_blank = validate.Length(min=1, error='Field cannot be blank')

    id = fields.Integer(dump_only=True)
    meeting_id = fields.Integer(required=True)
    user_document_id = fields.Integer(required=True)
    status = fields.String(required=True)
    is_active = fields.Boolean(allow_none=True)
    details = fields.String(allow_none=True)
    updated_at = fields.DateTime()
File without changes
@@ -0,0 +1,67 @@
1
+ from sqlalchemy import (
2
+ Column,
3
+ Integer,
4
+ String,
5
+ DateTime,
6
+ Float,
7
+ Text,
8
+ ForeignKey,
9
+ Enum,
10
+ )
11
+ from datetime import datetime
12
+
13
+ from ...database import Base
14
+
15
+
16
class PipelineCrawlSessionModel(Base):
    """One website-crawl run that extracts pipeline drug data.

    A session may be attached to a SEC company and/or an OUS company —
    both foreign keys are nullable. Timing, status, and volume statistics
    for the run are recorded inline.
    """

    __tablename__ = 'pipeline_crawl_sessions'

    id = Column(Integer, primary_key=True)
    company_sec_id = Column(
        Integer,
        ForeignKey('companies_sec.id'),
        nullable=True,
        index=True,
    )
    company_ous_id = Column(
        Integer,
        ForeignKey('companies_ous.id'),
        nullable=True,
        index=True,
    )

    # Session tracking
    session_uuid = Column(String(36), nullable=False, unique=True, index=True)  # 36 chars fits a canonical hyphenated UUID
    crawl_date = Column(DateTime, nullable=False, default=datetime.utcnow, index=True)

    # Timing metrics
    crawl_duration_seconds = Column(Float, nullable=True)
    page_discovery_time_seconds = Column(Float, nullable=True)
    extraction_time_seconds = Column(Float, nullable=True)

    # Status tracking (MySQL ENUM)
    status = Column(
        Enum('started', 'discovering_pages', 'extracting_data', 'completed', 'failed', 'timeout'),
        nullable=False,
        default='started',
        index=True,
    )
    error_message = Column(Text, nullable=True)

    # Crawl statistics
    total_pages_discovered = Column(Integer, nullable=True, default=0)
    total_pages_crawled = Column(Integer, nullable=True, default=0)
    total_screenshots_captured = Column(Integer, nullable=True, default=0)
    total_drugs_extracted = Column(Integer, nullable=True, default=0)

    # Crawl configuration
    crawl_method = Column(String(50), nullable=True)  # free-form method label; not constrained here
    max_depth = Column(Integer, nullable=True, default=2)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )
@@ -0,0 +1,22 @@
1
+ from marshmallow import Schema, fields
2
+
3
+
4
class PipelineCrawlSessionSchema(Schema):
    """Marshmallow schema mirroring ``PipelineCrawlSessionModel``."""

    id = fields.Int(dump_only=True)
    company_sec_id = fields.Int(allow_none=True)
    company_ous_id = fields.Int(allow_none=True)
    session_uuid = fields.Str(required=True)
    crawl_date = fields.DateTime(required=True)
    crawl_duration_seconds = fields.Float(allow_none=True)
    page_discovery_time_seconds = fields.Float(allow_none=True)
    extraction_time_seconds = fields.Float(allow_none=True)
    status = fields.Str(required=True)  # one of the model's MySQL ENUM values; not re-validated here
    error_message = fields.Str(allow_none=True)
    total_pages_discovered = fields.Int(allow_none=True)
    total_pages_crawled = fields.Int(allow_none=True)
    total_screenshots_captured = fields.Int(allow_none=True)
    total_drugs_extracted = fields.Int(allow_none=True)
    crawl_method = fields.Str(allow_none=True)
    max_depth = fields.Int(allow_none=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)
File without changes
@@ -0,0 +1,80 @@
1
+ from sqlalchemy import (
2
+ Column,
3
+ Integer,
4
+ String,
5
+ DateTime,
6
+ Float,
7
+ Boolean,
8
+ ForeignKey,
9
+ Enum,
10
+ Index,
11
+ UniqueConstraint,
12
+ )
13
+ from datetime import datetime
14
+
15
+ from ...database import Base
16
+
17
+
18
class PipelineCrawledPageModel(Base):
    """A single page fetched within a pipeline crawl session.

    Page HTML and screenshot bytes live in S3 — only keys, hashes, and
    size metadata are stored here. ``(session_id, url_hash)`` is unique
    per session.
    """

    __tablename__ = 'pipeline_crawled_pages'

    id = Column(Integer, primary_key=True)
    session_id = Column(
        Integer,
        ForeignKey('pipeline_crawl_sessions.id'),
        nullable=False,
        index=True,
    )

    # URL information
    url = Column(String(1024), nullable=False)
    url_hash = Column(String(64), nullable=False, index=True)  # hash of url; 64 chars — presumably SHA-256 hex, confirm
    page_title = Column(String(512), nullable=True)

    # Page content storage (S3)
    html_content_s3_key = Column(String(512), nullable=True)
    html_content_hash = Column(String(64), nullable=True)
    html_content_length = Column(Integer, nullable=True)

    # Screenshot storage (S3)
    screenshot_s3_key = Column(String(512), nullable=True)
    screenshot_hash = Column(String(64), nullable=True)
    screenshot_width = Column(Integer, nullable=True)
    screenshot_height = Column(Integer, nullable=True)
    screenshot_file_size = Column(Integer, nullable=True)

    # Page classification (MySQL ENUM)
    page_type = Column(
        Enum('pipeline', 'science', 'rd', 'products', 'clinical_trials', 'other'),
        nullable=True,
    )
    relevance_score = Column(Float, nullable=True)

    # Extraction metadata
    has_drug_data = Column(Boolean, nullable=True, default=False)
    extraction_method = Column(String(50), nullable=True)

    # Crawl metadata
    crawl_depth = Column(Integer, nullable=True, default=0)
    parent_page_id = Column(Integer, ForeignKey('pipeline_crawled_pages.id'), nullable=True)  # self-referential FK to the page this was discovered from
    discovered_from_url = Column(String(1024), nullable=True)

    # Technical metadata
    http_status_code = Column(Integer, nullable=True)
    content_type = Column(String(128), nullable=True)
    response_time_ms = Column(Integer, nullable=True)

    # Timestamps
    crawled_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # NOTE(review): on MySQL the unique constraint creates an index on the
    # same column pair, making idx_session_url_hash largely redundant —
    # kept exactly as declared.
    __table_args__ = (
        Index('idx_session_url_hash', 'session_id', 'url_hash'),
        UniqueConstraint('session_id', 'url_hash', name='uq_session_url'),
    )
@@ -0,0 +1,34 @@
1
+ from marshmallow import Schema, fields
2
+
3
+
4
class PipelineCrawledPageSchema(Schema):
    """Marshmallow schema mirroring ``PipelineCrawledPageModel``."""

    id = fields.Int(dump_only=True)
    session_id = fields.Int(required=True)
    url = fields.Str(required=True)
    url_hash = fields.Str(required=True)
    page_title = fields.Str(allow_none=True)
    html_content_s3_key = fields.Str(allow_none=True)
    html_content_hash = fields.Str(allow_none=True)
    html_content_length = fields.Int(allow_none=True)
    screenshot_s3_key = fields.Str(allow_none=True)
    screenshot_hash = fields.Str(allow_none=True)
    screenshot_width = fields.Int(allow_none=True)
    screenshot_height = fields.Int(allow_none=True)
    screenshot_file_size = fields.Int(allow_none=True)
    page_type = fields.Str(allow_none=True)  # one of the model's MySQL ENUM values; not re-validated here
    relevance_score = fields.Float(allow_none=True)
    has_drug_data = fields.Bool(allow_none=True)
    extraction_method = fields.Str(allow_none=True)
    crawl_depth = fields.Int(allow_none=True)
    parent_page_id = fields.Int(allow_none=True)
    discovered_from_url = fields.Str(allow_none=True)
    http_status_code = fields.Int(allow_none=True)
    content_type = fields.Str(allow_none=True)
    response_time_ms = fields.Int(allow_none=True)
    crawled_at = fields.DateTime(required=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)
31
+
32
+
33
class PipelineCrawledPageResourceSchema(PipelineCrawledPageSchema):
    """Alias of ``PipelineCrawledPageSchema`` under the *ResourceSchema*
    naming convention used by other schemas in this package."""
    pass
File without changes
@@ -0,0 +1,92 @@
1
+ from sqlalchemy import (
2
+ Column,
3
+ BIGINT,
4
+ Integer,
5
+ String,
6
+ DateTime,
7
+ Float,
8
+ Text,
9
+ Boolean,
10
+ ForeignKey,
11
+ Enum,
12
+ Index,
13
+ JSON,
14
+ )
15
+ from datetime import datetime
16
+
17
+ from ...database import Base
18
+
19
+
20
+ class PipelineDrugPortfolioModel(Base):
21
+ __tablename__ = 'pipeline_drug_portfolio'
22
+
23
+ id = Column(BIGINT, primary_key=True)
24
+ session_id = Column(
25
+ Integer,
26
+ ForeignKey('pipeline_crawl_sessions.id'),
27
+ nullable=False,
28
+ index=True,
29
+ )
30
+ page_id = Column(
31
+ Integer,
32
+ ForeignKey('pipeline_crawled_pages.id'),
33
+ nullable=False,
34
+ index=True,
35
+ )
36
+
37
+ company_sec_id = Column(
38
+ Integer,
39
+ ForeignKey('companies_sec.id'),
40
+ nullable=True,
41
+ index=True,
42
+ )
43
+ company_ous_id = Column(
44
+ Integer,
45
+ ForeignKey('companies_ous.id'),
46
+ nullable=True,
47
+ index=True,
48
+ )
49
+
50
+ # Core drug information
51
+ drug_name = Column(String(512), nullable=False, index=True)
52
+ drug_name_normalized = Column(String(512), nullable=True, index=True)
53
+ synonyms = Column(JSON, nullable=True)
54
+
55
+ modality = Column(String(256), nullable=True)
56
+ targets = Column(JSON, nullable=True)
57
+ mechanism_of_action = Column(Text, nullable=True)
58
+
59
+ # Indications (JSON)
60
+ indications = Column(JSON, nullable=False)
61
+
62
+ # Additional information (JSON)
63
+ additional_info = Column(JSON, nullable=True)
64
+
65
+ # Extraction metadata (MySQL ENUM)
66
+ extraction_method = Column(
67
+ Enum('html_parsing', 'vision_model', 'hybrid', 'manual'),
68
+ nullable=False,
69
+ )
70
+ extraction_confidence = Column(Float, nullable=True)
71
+ raw_extraction_text = Column(Text, nullable=True)
72
+
73
+ # Data quality flags
74
+ is_verified = Column(Boolean, nullable=True, default=False)
75
+ is_duplicate = Column(Boolean, nullable=True, default=False, index=True)
76
+ needs_review = Column(Boolean, nullable=True, default=False, index=True)
77
+ review_notes = Column(Text, nullable=True)
78
+
79
+ # Timestamps
80
+ extracted_at = Column(DateTime, nullable=False, default=datetime.utcnow)
81
+ created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
82
+ updated_at = Column(
83
+ DateTime,
84
+ nullable=False,
85
+ default=datetime.utcnow,
86
+ onupdate=datetime.utcnow,
87
+ )
88
+
89
+ __table_args__ = (
90
+ Index('idx_company_drug_name', 'company_sec_id', 'drug_name_normalized'),
91
+ Index('idx_session_drug', 'session_id', 'drug_name_normalized'),
92
+ )
@@ -0,0 +1,31 @@
1
+ from marshmallow import Schema, fields
2
+
3
+
4
class PipelineDrugPortfolioSchema(Schema):
    """Marshmallow schema mirroring ``PipelineDrugPortfolioModel``.

    JSON-typed model columns are exposed as ``fields.Raw`` so arbitrary
    JSON structures pass through untouched.
    """

    id = fields.Int(dump_only=True)
    session_id = fields.Int(required=True)
    page_id = fields.Int(required=True)
    company_sec_id = fields.Int(allow_none=True)
    company_ous_id = fields.Int(allow_none=True)
    drug_name = fields.Str(required=True)
    drug_name_normalized = fields.Str(allow_none=True)
    synonyms = fields.Raw(allow_none=True)  # JSON array
    modality = fields.Str(allow_none=True)
    targets = fields.Raw(allow_none=True)  # JSON array
    mechanism_of_action = fields.Str(allow_none=True)
    indications = fields.Raw(required=True)  # JSON array
    additional_info = fields.Raw(allow_none=True)  # JSON object
    extraction_method = fields.Str(required=True)  # one of the model's MySQL ENUM values; not re-validated here
    extraction_confidence = fields.Float(allow_none=True)
    raw_extraction_text = fields.Str(allow_none=True)
    is_verified = fields.Bool(allow_none=True)
    is_duplicate = fields.Bool(allow_none=True)
    needs_review = fields.Bool(allow_none=True)
    review_notes = fields.Str(allow_none=True)
    extracted_at = fields.DateTime(required=True)
    created_at = fields.DateTime(dump_only=True)
    updated_at = fields.DateTime(dump_only=True)
+
29
+
30
class PipelineDrugPortfolioResourceSchema(PipelineDrugPortfolioSchema):
    """Alias of ``PipelineDrugPortfolioSchema`` under the *ResourceSchema*
    naming convention used by other schemas in this package."""
    pass
File without changes