welearn-database 1.0.0.dev1__tar.gz → 1.0.0.dev2__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (29)
  1. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/PKG-INFO +1 -1
  2. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/pyproject.toml +1 -1
  3. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/enumeration.py +2 -13
  4. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/models/corpus_related.py +105 -4
  5. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/models/document_related.py +36 -113
  6. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/LICENSE +0 -0
  7. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/README.md +0 -0
  8. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/__init__.py +0 -0
  9. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/README +0 -0
  10. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/env.py +0 -0
  11. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/script.py.mako +0 -0
  12. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/16ff997426d3_remove_error_retrieval_unique_constraint.py +0 -0
  13. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/4c7161819e5a_grafana_views.py +0 -0
  14. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/4fcbfb7f3145_added_api_key_management_table.py +0 -0
  15. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/5d82613c9aca_context_document.py +0 -0
  16. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/821173cf9c5d_initial_migration.py +0 -0
  17. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/89920abb7ff8_add_category.py +0 -0
  18. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/a50a1db3ca2a_add_used_since_column_for_embeddings.py +0 -0
  19. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/b031206324b7_agent_related.py +0 -0
  20. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/alembic/versions/e354666f951d_inferred_user.py +0 -0
  21. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/__init__.py +0 -0
  22. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/models/__init__.py +0 -0
  23. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/models/agent_related.py +0 -0
  24. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/models/grafana.py +0 -0
  25. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/data/models/user_related.py +0 -0
  26. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/database_utils.py +0 -0
  27. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/exceptions.py +0 -0
  28. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/modules/__init__.py +0 -0
  29. {welearn_database-1.0.0.dev1 → welearn_database-1.0.0.dev2}/welearn_database/modules/text_cleaning.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: welearn-database
3
- Version: 1.0.0.dev1
3
+ Version: 1.0.0.dev2
4
4
  Summary: All stuff related to relationnal database from the WeLearn project
5
5
  License: cc-by-sa-nc
6
6
  Author: Théo
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "welearn-database"
3
- version = "1.0.0.dev1"
3
+ version = "1.0.0.dev2"
4
4
  description = "All stuff related to relationnal database from the WeLearn project"
5
5
  authors = [
6
6
  {name = "Théo",email = "theo.nardin@cri-paris.org"}
@@ -1,4 +1,4 @@
1
- from enum import Enum, StrEnum, auto
1
+ from enum import Enum, auto, StrEnum
2
2
 
3
3
 
4
4
  class Step(Enum):
@@ -13,11 +13,9 @@ class Step(Enum):
13
13
  KEPT_FOR_TRACE = "kept_for_trace"
14
14
  DOCUMENT_IS_IRRETRIEVABLE = "document_is_irretrievable"
15
15
 
16
-
17
16
  class Counter(Enum):
18
17
  HIT = auto()
19
18
 
20
-
21
19
  class DbSchemaEnum(StrEnum):
22
20
  GRAFANA = auto()
23
21
  AGENT_RELATED = auto()
@@ -25,16 +23,7 @@ class DbSchemaEnum(StrEnum):
25
23
  DOCUMENT_RELATED = auto()
26
24
  USER_RELATED = auto()
27
25
 
28
-
29
26
  class ContextType(StrEnum):
30
27
  INTRODUCTION = auto()
31
28
  TARGET = auto()
32
- SUBJECT = auto()
33
-
34
-
35
- class ExternalIdType(StrEnum):
36
- DOI = auto()
37
- API_ID = auto()
38
- HANDLE = auto()
39
- SLUG = auto()
40
- QID = auto()
29
+ SUBJECT = auto()
@@ -1,15 +1,17 @@
1
1
  from datetime import datetime
2
2
  from uuid import UUID
3
3
 
4
- from sqlalchemy import types, ForeignKey, func
4
+ from sqlalchemy import ForeignKey, UniqueConstraint, func, types
5
5
  from sqlalchemy.dialects.postgresql import TIMESTAMP
6
- from sqlalchemy.orm import mapped_column, Mapped
6
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
7
7
 
8
- from . import Base
9
8
  from welearn_database.data.enumeration import DbSchemaEnum
10
9
 
10
+ from . import Base
11
+
11
12
  schema_name = DbSchemaEnum.CORPUS_RELATED.value
12
13
 
14
+
13
15
  class Corpus(Base):
14
16
  __tablename__ = "corpus"
15
17
  __table_args__ = {"schema": schema_name}
@@ -26,6 +28,7 @@ class Corpus(Base):
26
28
  ForeignKey(f"{schema_name}.category.id"),
27
29
  )
28
30
 
31
+
29
32
  class Category(Base):
30
33
  __tablename__ = "category"
31
34
  __table_args__ = {"schema": schema_name}
@@ -97,11 +100,109 @@ class NClassifierModel(Base):
97
100
  server_default="NOW()",
98
101
  )
99
102
 
103
+
100
104
  class CorpusNameEmbeddingModelLang(Base):
101
105
  __tablename__ = "corpus_name_embedding_model_lang"
102
106
  __table_args__ = {"schema": schema_name}
103
107
  __read_only__ = True
104
- source_name : Mapped[str]= mapped_column(primary_key=True)
108
+ source_name: Mapped[str] = mapped_column(primary_key=True)
105
109
  title: Mapped[str]
106
110
  lang: Mapped[str]
107
111
 
112
+
113
+ class CorpusEmbeddingModel(Base):
114
+ __tablename__ = "corpus_embedding_model"
115
+ __table_args__ = (
116
+ UniqueConstraint(
117
+ "corpus_id",
118
+ "embedding_model_id",
119
+ name="unique_corpus_embedding_association",
120
+ ),
121
+ {"schema": schema_name},
122
+ )
123
+
124
+ corpus_id = mapped_column(
125
+ types.Uuid,
126
+ ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.corpus.id"),
127
+ primary_key=True,
128
+ )
129
+ embedding_model_id = mapped_column(
130
+ types.Uuid,
131
+ ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.embedding_model.id"),
132
+ primary_key=True,
133
+ )
134
+
135
+ used_since: Mapped[datetime] = mapped_column(
136
+ TIMESTAMP(timezone=False),
137
+ nullable=False,
138
+ default=func.localtimestamp(),
139
+ server_default="NOW()",
140
+ )
141
+
142
+ embedding_model: Mapped["EmbeddingModel"] = relationship()
143
+ corpus: Mapped["Corpus"] = relationship()
144
+
145
+
146
+ class CorpusNClassifierModel(Base):
147
+ __tablename__ = "corpus_n_classifier_model"
148
+ __table_args__ = (
149
+ UniqueConstraint(
150
+ "corpus_id",
151
+ "n_classifier_model_id",
152
+ name="unique_corpus_n_classifier_association",
153
+ ),
154
+ {"schema": schema_name},
155
+ )
156
+
157
+ corpus_id = mapped_column(
158
+ types.Uuid,
159
+ ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.corpus.id"),
160
+ primary_key=True,
161
+ )
162
+ n_classifier_model_id = mapped_column(
163
+ types.Uuid,
164
+ ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.n_classifier_model.id"),
165
+ primary_key=True,
166
+ )
167
+
168
+ used_since: Mapped[datetime] = mapped_column(
169
+ TIMESTAMP(timezone=False),
170
+ nullable=False,
171
+ default=func.localtimestamp(),
172
+ server_default="NOW()",
173
+ )
174
+
175
+ n_classifier_model: Mapped["NClassifierModel"] = relationship()
176
+ corpus: Mapped["Corpus"] = relationship()
177
+
178
+
179
+ class CorpusBiClassifierModel(Base):
180
+ __tablename__ = "corpus_bi_classifier_model"
181
+ __table_args__ = (
182
+ UniqueConstraint(
183
+ "corpus_id",
184
+ "bi_classifier_model_id",
185
+ name="unique_corpus_bi_classifier_association",
186
+ ),
187
+ {"schema": schema_name},
188
+ )
189
+
190
+ corpus_id = mapped_column(
191
+ types.Uuid,
192
+ ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.corpus.id"),
193
+ primary_key=True,
194
+ )
195
+ bi_classifier_model_id = mapped_column(
196
+ types.Uuid,
197
+ ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.bi_classifier_model.id"),
198
+ primary_key=True,
199
+ )
200
+ used_since: Mapped[datetime] = mapped_column(
201
+ TIMESTAMP(timezone=False),
202
+ nullable=False,
203
+ default=func.localtimestamp(),
204
+ server_default="NOW()",
205
+ )
206
+
207
+ bi_classifier_model: Mapped["BiClassifierModel"] = relationship()
208
+ corpus: Mapped["Corpus"] = relationship()
@@ -8,13 +8,7 @@ from sqlalchemy import ForeignKey, Integer, LargeBinary, UniqueConstraint, func,
8
8
  from sqlalchemy.dialects.postgresql import ARRAY, ENUM, TIMESTAMP
9
9
  from sqlalchemy.orm import Mapped, mapped_column, relationship, validates
10
10
 
11
- from welearn_database.data.enumeration import (
12
- ContextType,
13
- Counter,
14
- DbSchemaEnum,
15
- ExternalIdType,
16
- Step,
17
- )
11
+ from welearn_database.data.enumeration import ContextType, Counter, DbSchemaEnum, Step
18
12
  from welearn_database.data.models import Base
19
13
  from welearn_database.data.models.corpus_related import (
20
14
  BiClassifierModel,
@@ -54,14 +48,6 @@ class WeLearnDocument(Base):
54
48
  id: Mapped[UUID] = mapped_column(
55
49
  types.Uuid, primary_key=True, nullable=False, server_default="gen_random_uuid()"
56
50
  )
57
- external_id: Mapped[str | None]
58
- external_id_type: Mapped[str | None] = mapped_column(
59
- ENUM(
60
- *(e.value.lower() for e in ExternalIdType),
61
- name="external_id_type",
62
- schema="document_related",
63
- ),
64
- )
65
51
  url: Mapped[str] = mapped_column(nullable=False)
66
52
  title: Mapped[str | None]
67
53
  lang: Mapped[str | None]
@@ -264,6 +250,41 @@ class ErrorRetrieval(Base):
264
250
  document: Mapped["WeLearnDocument"] = relationship()
265
251
 
266
252
 
253
+ class ErrorDataQuality(Base):
254
+ __tablename__ = "error_data_quality"
255
+ __table_args__ = ({"schema": schema_name},)
256
+
257
+ id: Mapped[UUID] = mapped_column(
258
+ types.Uuid, primary_key=True, nullable=False, server_default="gen_random_uuid()"
259
+ )
260
+ document_id: Mapped[UUID] = mapped_column(
261
+ types.Uuid,
262
+ ForeignKey(
263
+ f"{DbSchemaEnum.DOCUMENT_RELATED.value}.welearn_document.id",
264
+ name="error_data_quality_document_id_fkey",
265
+ ),
266
+ nullable=False,
267
+ )
268
+ slice_id: Mapped[UUID] = mapped_column(
269
+ types.Uuid,
270
+ ForeignKey(
271
+ f"{DbSchemaEnum.DOCUMENT_RELATED.value}.document_slice.id",
272
+ name="error_data_quality_slice_id_fkey",
273
+ ),
274
+ nullable=True,
275
+ )
276
+ error_raiser: Mapped[str]
277
+ error_info: Mapped[str]
278
+ created_at: Mapped[datetime] = mapped_column(
279
+ TIMESTAMP(timezone=False),
280
+ nullable=False,
281
+ default=func.localtimestamp(),
282
+ server_default="NOW()",
283
+ )
284
+ document: Mapped["WeLearnDocument"] = relationship()
285
+ slice: Mapped["DocumentSlice"] = relationship()
286
+
287
+
267
288
  class DocumentSlice(Base):
268
289
  __tablename__ = "document_slice"
269
290
  __table_args__ = {"schema": schema_name}
@@ -327,104 +348,6 @@ class AnalyticCounter(Base):
327
348
  document: Mapped["WeLearnDocument"] = relationship()
328
349
 
329
350
 
330
- class CorpusEmbeddingModel(Base):
331
- __tablename__ = "corpus_embedding_model"
332
- __table_args__ = (
333
- UniqueConstraint(
334
- "corpus_id",
335
- "embedding_model_id",
336
- name="unique_corpus_embedding_association",
337
- ),
338
- {"schema": schema_name},
339
- )
340
-
341
- corpus_id = mapped_column(
342
- types.Uuid,
343
- ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.corpus.id"),
344
- primary_key=True,
345
- )
346
- embedding_model_id = mapped_column(
347
- types.Uuid,
348
- ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.embedding_model.id"),
349
- primary_key=True,
350
- )
351
-
352
- used_since: Mapped[datetime] = mapped_column(
353
- TIMESTAMP(timezone=False),
354
- nullable=False,
355
- default=func.localtimestamp(),
356
- server_default="NOW()",
357
- )
358
-
359
- embedding_model: Mapped["EmbeddingModel"] = relationship()
360
- corpus: Mapped["Corpus"] = relationship()
361
-
362
-
363
- class CorpusNClassifierModel(Base):
364
- __tablename__ = "corpus_n_classifier_model"
365
- __table_args__ = (
366
- UniqueConstraint(
367
- "corpus_id",
368
- "n_classifier_model_id",
369
- name="unique_corpus_n_classifier_association",
370
- ),
371
- {"schema": schema_name},
372
- )
373
-
374
- corpus_id = mapped_column(
375
- types.Uuid,
376
- ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.corpus.id"),
377
- primary_key=True,
378
- )
379
- n_classifier_model_id = mapped_column(
380
- types.Uuid,
381
- ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.n_classifier_model.id"),
382
- primary_key=True,
383
- )
384
-
385
- used_since: Mapped[datetime] = mapped_column(
386
- TIMESTAMP(timezone=False),
387
- nullable=False,
388
- default=func.localtimestamp(),
389
- server_default="NOW()",
390
- )
391
-
392
- n_classifier_model: Mapped["NClassifierModel"] = relationship()
393
- corpus: Mapped["Corpus"] = relationship()
394
-
395
-
396
- class CorpusBiClassifierModel(Base):
397
- __tablename__ = "corpus_bi_classifier_model"
398
- __table_args__ = (
399
- UniqueConstraint(
400
- "corpus_id",
401
- "bi_classifier_model_id",
402
- name="unique_corpus_bi_classifier_association",
403
- ),
404
- {"schema": schema_name},
405
- )
406
-
407
- corpus_id = mapped_column(
408
- types.Uuid,
409
- ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.corpus.id"),
410
- primary_key=True,
411
- )
412
- bi_classifier_model_id = mapped_column(
413
- types.Uuid,
414
- ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.bi_classifier_model.id"),
415
- primary_key=True,
416
- )
417
- used_since: Mapped[datetime] = mapped_column(
418
- TIMESTAMP(timezone=False),
419
- nullable=False,
420
- default=func.localtimestamp(),
421
- server_default="NOW()",
422
- )
423
-
424
- bi_classifier_model: Mapped["BiClassifierModel"] = relationship()
425
- corpus: Mapped["Corpus"] = relationship()
426
-
427
-
428
351
  class Sdg(Base):
429
352
  __tablename__ = "sdg"
430
353
  __table_args__ = {"schema": schema_name}