contextbase-plugin-chrome-local 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextbase_plugin_chrome_local-0.2.4.dist-info/METADATA +13 -0
- contextbase_plugin_chrome_local-0.2.4.dist-info/RECORD +17 -0
- contextbase_plugin_chrome_local-0.2.4.dist-info/WHEEL +4 -0
- plugin_chrome_local/__init__.py +0 -0
- plugin_chrome_local/binding_config.py +50 -0
- plugin_chrome_local/component.py +120 -0
- plugin_chrome_local/defs/__init__.py +0 -0
- plugin_chrome_local/defs/defs.yaml +1 -0
- plugin_chrome_local/models/__init__.py +0 -0
- plugin_chrome_local/models/base.py +7 -0
- plugin_chrome_local/models/ctx.py +27 -0
- plugin_chrome_local/models/ingress.py +56 -0
- plugin_chrome_local/models/translators.py +68 -0
- plugin_chrome_local/models/types.py +3 -0
- plugin_chrome_local/plugin.json +7 -0
- plugin_chrome_local/sources/__init__.py +0 -0
- plugin_chrome_local/sources/snapshot.py +338 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: contextbase-plugin-chrome-local
|
|
3
|
+
Version: 0.2.4
|
|
4
|
+
Summary: Chrome local plugin for ContextBase
|
|
5
|
+
Author: Alizain Feerasta
|
|
6
|
+
Author-email: Alizain Feerasta <alizain.feerasta@gmail.com>
|
|
7
|
+
Requires-Dist: contextbase-shared-plugins==0.2.4
|
|
8
|
+
Requires-Dist: dagster==1.12.14
|
|
9
|
+
Requires-Dist: dagster-dlt==0.28.14
|
|
10
|
+
Requires-Dist: dlt>=1.26.0
|
|
11
|
+
Requires-Dist: pydantic>=2.12.0
|
|
12
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
13
|
+
Requires-Python: >=3.14, <3.15
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
plugin_chrome_local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
plugin_chrome_local/binding_config.py,sha256=p_l7z-2oCMivd_8-H6T9uReOJUslfAsR8O6WBd7NHS0,1498
|
|
3
|
+
plugin_chrome_local/component.py,sha256=a542jdao0HU-BHgW6MbD2O0hPKnwgSurzAgiXqXxbxM,4063
|
|
4
|
+
plugin_chrome_local/defs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
plugin_chrome_local/defs/defs.yaml,sha256=IYKSc6UTq-426pzOmauAZbwFrMJJB1XplhDGIbEW850,61
|
|
6
|
+
plugin_chrome_local/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
plugin_chrome_local/models/base.py,sha256=Fp7n4EfxQOxzqm34-D-rYyu60P88eZUIewCBNP_Iuc4,119
|
|
8
|
+
plugin_chrome_local/models/ctx.py,sha256=ugppXE8Y2Cux9yz26AqzBLY58Q_kX8W7QZ910jhY1cQ,754
|
|
9
|
+
plugin_chrome_local/models/ingress.py,sha256=fU1WpaRUijJKF3cA4-sGXEmIt2uFYilcr6BIoxlAkck,1588
|
|
10
|
+
plugin_chrome_local/models/translators.py,sha256=Q70Xkb54ZB8T4lecklRDypxbKGlxuVlY_bDkPtt6JDs,2010
|
|
11
|
+
plugin_chrome_local/models/types.py,sha256=u2Y5SHZdK-_SFOg254UDtAD7tdZ5FjnbY6tsxjlp0ys,68
|
|
12
|
+
plugin_chrome_local/plugin.json,sha256=ntgpILrXGIkOnaeWMSgFv_DAS58MVKG4A3yfr3DkzDQ,85
|
|
13
|
+
plugin_chrome_local/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
plugin_chrome_local/sources/snapshot.py,sha256=kgVe6kx86vna-FCiEiAteaNnCjL2twy5grB8wU4TL8g,10477
|
|
15
|
+
contextbase_plugin_chrome_local-0.2.4.dist-info/WHEEL,sha256=i9aSRDivn5iP9LaR1BLQX2GNAuriQWPsFwbbWygTX2k,81
|
|
16
|
+
contextbase_plugin_chrome_local-0.2.4.dist-info/METADATA,sha256=y5jVuyAhKZjWQc0GLaU2nQckGbm5rYWNrF4TqxLRKEo,445
|
|
17
|
+
contextbase_plugin_chrome_local-0.2.4.dist-info/RECORD,,
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import Field, model_validator
|
|
6
|
+
|
|
7
|
+
from shared_plugins.bindings import BaseBindingConfigModel, NonEmptyText, ResolvedPath
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ChromeLocalDataDir(BaseBindingConfigModel):
    """One named Chromium data directory that is scanned for profiles."""

    # Label for this directory. Profile discovery uses it as the prefix of
    # every profile name found below ``path``; the binding config requires it
    # to be unique across all configured directories.
    name: NonEmptyText
    # Directory whose immediate subdirectories are inspected for profiles.
    path: ResolvedPath
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ChromeLocalBindingConfig(BaseBindingConfigModel):
    """Binding configuration listing the Chromium data directories to snapshot.

    When ``data_dirs`` is not supplied it defaults to two entries under the
    current user's home directory:
    ``~/Library/Application Support/Google/Chrome`` (named ``chrome``) and
    ``~/Library/Application Support/Google/Chrome Canary`` (named
    ``chrome_canary``). At least one entry is required, and entry names must
    be unique.
    """

    data_dirs: list[ChromeLocalDataDir] = Field(
        default_factory=lambda: [
            ChromeLocalDataDir(
                name="chrome",
                path=Path.home()
                / "Library"
                / "Application Support"
                / "Google"
                / "Chrome",
            ),
            ChromeLocalDataDir(
                name="chrome_canary",
                path=Path.home()
                / "Library"
                / "Application Support"
                / "Google"
                / "Chrome Canary",
            ),
        ],
        min_length=1,
    )

    @model_validator(mode="after")
    def _validate_unique_names(self) -> ChromeLocalBindingConfig:
        """Reject configurations in which two data directories share a name.

        Returns:
            The validated model itself.

        Raises:
            ValueError: If any name occurs more than once. The message lists
                each duplicated name once, in order of first appearance.
        """
        # Single counting pass; dict insertion order keeps the names in
        # first-appearance order for the error message.
        occurrences: dict[str, int] = {}
        for data_dir in self.data_dirs:
            occurrences[data_dir.name] = occurrences.get(data_dir.name, 0) + 1

        duplicate_names = [name for name, count in occurrences.items() if count > 1]
        if duplicate_names:
            duplicates = ", ".join(duplicate_names)
            raise ValueError(
                f"data_dirs names must be unique; duplicates: {duplicates}."
            )
        return self
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import dagster as dg
|
|
2
|
+
from dagster import AssetExecutionContext
|
|
3
|
+
from dagster_dlt import DagsterDltResource
|
|
4
|
+
from shared_plugins.automation import non_overlapping_automation_condition
|
|
5
|
+
from shared_plugins.bindings import (
|
|
6
|
+
parse_binding_config,
|
|
7
|
+
resolve_binding_models,
|
|
8
|
+
)
|
|
9
|
+
from shared_plugins.control_plane import ControlPlaneClient
|
|
10
|
+
from shared_plugins.dlt import resolve_partition_binding, run_dlt_pipeline
|
|
11
|
+
from shared_plugins.naming import (
|
|
12
|
+
dagster_asset_group_name,
|
|
13
|
+
dagster_asset_tags,
|
|
14
|
+
dagster_dlt_asset_key,
|
|
15
|
+
dagster_partition_def_name,
|
|
16
|
+
dagster_pool_name,
|
|
17
|
+
dlt_source_name,
|
|
18
|
+
plugin_id_from_module,
|
|
19
|
+
)
|
|
20
|
+
from shared_plugins.resources import DLT_RESOURCE
|
|
21
|
+
|
|
22
|
+
from .binding_config import ChromeLocalBindingConfig
|
|
23
|
+
from .sources.snapshot import SUPPORTED_MODEL_NAMES, chrome_local_snapshot_source
|
|
24
|
+
|
|
25
|
+
# Plugin identifier derived from this module's file path.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name passed to the dlt pipeline runner and used in the source name.
SNAPSHOT_JOB = "snapshot"
# dlt source name for the snapshot job; the asset keys are built from it.
SNAPSHOT_SOURCE_NAME = dlt_source_name(PLUGIN_ID, SNAPSHOT_JOB)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _build_snapshot_specs(
    partitions_def: dg.PartitionsDefinition,
    automation_condition: dg.AutomationCondition,
) -> list[dg.AssetSpec]:
    """Create the asset specs for the snapshot job's two models.

    Args:
        partitions_def: Partitions definition attached to every spec.
        automation_condition: Automation condition attached to every spec.

    Returns:
        Two specs, for the "visit" and "bookmark" assets in that order, which
        share the plugin's group name and tags.
    """
    common_kwargs = {
        "group_name": dagster_asset_group_name(PLUGIN_ID),
        "tags": dagster_asset_tags(PLUGIN_ID),
        "automation_condition": automation_condition,
        "partitions_def": partitions_def,
    }
    asset_keys = [
        dagster_dlt_asset_key(SNAPSHOT_SOURCE_NAME, model_name)
        for model_name in ("visit", "bookmark")
    ]
    return [dg.AssetSpec(key=asset_key, **common_kwargs) for asset_key in asset_keys]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ChromeLocalSyncComponent(dg.Component):
    """Dagster component providing the Chrome local snapshot assets and sensor."""

    def build_defs(self, context: dg.ComponentLoadContext) -> dg.Definitions:
        """Assemble the Dagster definitions for this plugin.

        Returns:
            Definitions containing the ``chrome_local_snapshot`` multi-asset
            (one spec per snapshot model), an automation-condition sensor
            targeting that multi-asset, and the ``dlt_resource`` resource.
        """
        # Dynamic partitions; each run resolves its partition to a binding.
        partitions_def = dg.DynamicPartitionsDefinition(
            name=dagster_partition_def_name(PLUGIN_ID)
        )

        snapshot_specs = _build_snapshot_specs(
            partitions_def=partitions_def,
            # Requested when missing or on a 15-minute cron. The wrapper's name
            # suggests it prevents overlapping runs — TODO confirm in
            # shared_plugins.automation.
            automation_condition=non_overlapping_automation_condition(
                dg.AutomationCondition.on_missing()
                | dg.AutomationCondition.on_cron("*/15 * * * *")
            ),
        )

        @dg.multi_asset(
            specs=snapshot_specs,
            can_subset=True,
            name="chrome_local_snapshot",
            pool=dagster_pool_name(PLUGIN_ID),
        )
        def chrome_local_snapshot_assets(
            context: AssetExecutionContext,
            dlt_resource: DagsterDltResource,
            control_plane: dg.ResourceParam[ControlPlaneClient],
        ):
            # Look up the binding that corresponds to the partition being run.
            binding = resolve_partition_binding(
                context=context,
                control_plane=control_plane,
                plugin_id=PLUGIN_ID,
            )
            binding_id = str(binding.binding_id)
            cfg = parse_binding_config(binding, ChromeLocalBindingConfig)
            # Every supported model is active unless the binding says otherwise.
            binding_models = resolve_binding_models(
                binding,
                supported_models=SUPPORTED_MODEL_NAMES,
                default_active=SUPPORTED_MODEL_NAMES,
            )
            source = chrome_local_snapshot_source(
                binding_id,
                cfg,
                binding_models=binding_models,
            )
            yield from run_dlt_pipeline(
                context=context,
                dlt_resource=dlt_resource,
                source=source,
                plugin_id=PLUGIN_ID,
                binding_id=binding_id,
                job_name=SNAPSHOT_JOB,
            )

        # Sensor that evaluates the assets' automation conditions; it is
        # running by default and ticks at most every 30 seconds.
        automation_sensor = dg.AutomationConditionSensorDefinition(
            name="chrome_local_automation_sensor",
            target=dg.AssetSelection.assets(chrome_local_snapshot_assets),
            default_status=dg.DefaultSensorStatus.RUNNING,
            minimum_interval_seconds=30,
        )

        # NOTE(review): the asset also requires a ``control_plane`` resource,
        # which is not registered here — presumably supplied by the host
        # project's definitions; confirm.
        return dg.Definitions(
            assets=[chrome_local_snapshot_assets],
            sensors=[automation_sensor],
            resources={
                "dlt_resource": DLT_RESOURCE,
            },
        )
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Dagster component definition: load the Chrome local sync component class.
type: plugin_chrome_local.component.ChromeLocalSyncComponent
|
|
File without changes
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import AwareDatetime
|
|
4
|
+
from shared_plugins.models import CtxModel, IdStr, NonNegativeInt
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class VisitRow(CtxModel):
    """Output row describing a single Chrome history visit."""

    # Name of the profile the visit was read from.
    profile: IdStr
    # ``id`` of the row in the History database's ``visits`` table.
    visit_id: NonNegativeInt
    # URL and title of the visited page, taken from the joined ``urls`` row.
    url: str | None = None
    title: str | None = None
    # Timezone-aware time of the visit.
    visit_time: AwareDatetime | None = None
    # Populated from the ``visits.visit_duration`` column without conversion.
    visit_duration_ms: NonNegativeInt | None = None
    # Counters copied from the joined ``urls`` row (per URL, not per visit).
    url_visit_count: NonNegativeInt | None = None
    url_typed_count: NonNegativeInt | None = None
    # Raw value of the ``visits.transition`` column.
    transition_type: int | None = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BookmarkRow(CtxModel):
    """Output row describing a single Chrome URL bookmark."""

    # Name of the profile the bookmark was read from.
    profile: IdStr
    # Bookmark node ``id`` from the Bookmarks file.
    id: IdStr
    guid: str | None = None
    name: str | None = None
    url: IdStr
    # Slash-separated folder names, starting at the root folder key.
    folder_path: IdStr
    # Timezone-aware timestamps converted from the raw Chromium values.
    date_added: AwareDatetime | None = None
    date_last_used: AwareDatetime | None = None
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, StrictStr
|
|
7
|
+
from shared_plugins.models import IdStr, IngressModel
|
|
8
|
+
from shared_plugins.sqlalchemy_types import ChromiumTimestamp
|
|
9
|
+
from sqlalchemy import ForeignKey, String
|
|
10
|
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
|
11
|
+
|
|
12
|
+
from .base import Base
|
|
13
|
+
from .types import RawTimestamp
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ChromeProfileIngress(IngressModel):
    """A discovered Chrome profile directory."""

    # "<data dir name>/<profile directory name>", as built by profile discovery.
    name: IdStr
    # Profile directory path as a string.
    dir_path: IdStr
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ChromeUrl(Base):
    """Row from urls table in Chrome History database."""

    __tablename__ = "urls"

    # Primary key; referenced by the ``url`` column of the ``visits`` table.
    id: Mapped[int] = mapped_column(primary_key=True)
    url: Mapped[str | None] = mapped_column(String)
    title: Mapped[str | None] = mapped_column(String)
    # Per-URL counters as stored in the database.
    visit_count: Mapped[int | None]
    typed_count: Mapped[int | None]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ChromeVisit(Base):
    """Row from visits table in Chrome History database."""

    __tablename__ = "visits"

    id: Mapped[int] = mapped_column(primary_key=True)
    # The database column is named ``url``; it is exposed as ``url_id`` here
    # and is a foreign key to ``urls.id``.
    url_id: Mapped[int] = mapped_column("url", ForeignKey("urls.id"))
    # Stored through the ChromiumTimestamp column type and read as a datetime.
    visit_time: Mapped[datetime | None] = mapped_column(ChromiumTimestamp())
    # Raw column value; NOTE(review): units are not converted anywhere in
    # this package — confirm what downstream consumers expect.
    visit_duration: Mapped[int | None]
    transition: Mapped[int | None]

    # The ``urls`` row this visit points at (via ``url_id``).
    url_entry: Mapped[ChromeUrl] = relationship()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ChromeBookmarkIngress(IngressModel):
    """Validated URL bookmark node read from a profile's Bookmarks file."""

    # Only nodes whose ``type`` is exactly "url" validate as bookmarks.
    type: Literal["url"]
    # Name of the profile the node was found in (added by the scanner).
    profile: IdStr
    id: IdStr
    guid: StrictStr | None = None
    name: StrictStr | None = None
    # Must be a non-empty string.
    url: StrictStr = Field(min_length=1)
    # Slash-separated folder path of the node (added by the scanner).
    folder_path: IdStr
    # Raw timestamps as found in the file; the translators convert them to
    # datetimes.
    date_added: RawTimestamp = None
    date_last_used: RawTimestamp = None
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Iterator
|
|
4
|
+
from datetime import datetime, timedelta
|
|
5
|
+
|
|
6
|
+
from shared_plugins.sqlalchemy_types import CHROMIUM_EPOCH
|
|
7
|
+
|
|
8
|
+
from .ctx import BookmarkRow, VisitRow
|
|
9
|
+
from .ingress import ChromeBookmarkIngress, ChromeVisit
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _chromium_to_datetime(
    *,
    value: int | str | None,
) -> datetime | None:
    """Convert a raw Chromium timestamp into a datetime.

    The raw value is a count of microseconds added to ``CHROMIUM_EPOCH``.
    Chromium uses a raw value of zero to mean "no time recorded" (for example
    the ``date_last_used`` of a bookmark that was never opened), so zero maps
    to ``None`` rather than to the epoch itself.

    Args:
        value: Microsecond count as an int or a numeric string, or ``None``.

    Returns:
        ``CHROMIUM_EPOCH`` shifted by ``value`` microseconds, or ``None`` when
        ``value`` is ``None`` or zero.

    Raises:
        ValueError: If ``value`` is a string that does not parse as an integer.
    """
    if value is None:
        return None

    microseconds = int(value)
    if microseconds == 0:
        # Null time: returning the epoch here would surface as a bogus
        # 1601-01-01 timestamp and win over real fallback values downstream.
        return None

    return CHROMIUM_EPOCH + timedelta(microseconds=microseconds)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def visits_to_ctx_models(
    *,
    binding_id: str,
    profile: str,
    visits: Iterable[ChromeVisit],
) -> Iterator[VisitRow]:
    """Translate History visit records into ``VisitRow`` models, lazily.

    Args:
        binding_id: Binding identifier stamped on every row.
        profile: Profile name stamped on every row.
        visits: Visit records whose ``url_entry`` relationship is available.

    Yields:
        One ``VisitRow`` per visit, in input order. The visit time is used as
        both the row's visit time and its source-updated timestamp.
    """
    for record in visits:
        page = record.url_entry
        visited_at = record.visit_time
        # NOTE(review): Chromium is understood to store visits.visit_duration
        # in microseconds, while the target field is named *_ms; the value is
        # passed through unchanged — confirm the intended units.
        yield VisitRow(
            ctx_binding_id=binding_id,
            ctx_source_updated_at=visited_at,
            profile=profile,
            visit_id=record.id,
            url=page.url,
            title=page.title,
            visit_time=visited_at,
            visit_duration_ms=record.visit_duration,
            url_visit_count=page.visit_count,
            url_typed_count=page.typed_count,
            transition_type=record.transition,
        )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def bookmarks_to_ctx_models(
    *,
    binding_id: str,
    bookmarks: Iterable[ChromeBookmarkIngress],
) -> Iterator[BookmarkRow]:
    """Translate validated bookmark nodes into ``BookmarkRow`` models, lazily.

    Args:
        binding_id: Binding identifier stamped on every row.
        bookmarks: Validated bookmark nodes carrying raw Chromium timestamps.

    Yields:
        One ``BookmarkRow`` per bookmark, in input order. The source-updated
        timestamp is the last-used time when one is available, otherwise the
        time the bookmark was added.
    """
    for entry in bookmarks:
        added_at = _chromium_to_datetime(value=entry.date_added)
        last_used_at = _chromium_to_datetime(value=entry.date_last_used)

        if last_used_at is None:
            updated_at = added_at
        else:
            updated_at = last_used_at

        yield BookmarkRow(
            ctx_binding_id=binding_id,
            ctx_source_updated_at=updated_at,
            profile=entry.profile,
            id=entry.id,
            guid=entry.guid,
            name=entry.name,
            url=entry.url,
            folder_path=entry.folder_path,
            date_added=added_at,
            date_last_used=last_used_at,
        )
|
|
File without changes
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import Iterator, Mapping, Sequence
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import dlt
|
|
11
|
+
from pydantic import ValidationError
|
|
12
|
+
from shared_plugins.bindings import ResolvedBindingModels, iter_active_model_rows
|
|
13
|
+
from shared_plugins.models import format_validation_error
|
|
14
|
+
from shared_plugins.naming import (
|
|
15
|
+
dlt_resource_name,
|
|
16
|
+
dlt_source_name,
|
|
17
|
+
plugin_id_from_module,
|
|
18
|
+
)
|
|
19
|
+
from shared_plugins.resources import ctx_dlt_resource
|
|
20
|
+
from shared_plugins.sqlite import sqlite_snapshot
|
|
21
|
+
from sqlalchemy import create_engine, select
|
|
22
|
+
from sqlalchemy.orm import Session, joinedload
|
|
23
|
+
|
|
24
|
+
from ..binding_config import ChromeLocalBindingConfig, ChromeLocalDataDir
|
|
25
|
+
from ..models.ctx import BookmarkRow, VisitRow
|
|
26
|
+
from ..models.ingress import (
|
|
27
|
+
ChromeBookmarkIngress,
|
|
28
|
+
ChromeProfileIngress,
|
|
29
|
+
ChromeVisit,
|
|
30
|
+
)
|
|
31
|
+
from ..models.translators import bookmarks_to_ctx_models, visits_to_ctx_models
|
|
32
|
+
|
|
33
|
+
# Plugin identifier derived from this module's file path.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name used to build the dlt source name.
JOB = "snapshot"
LOGGER = logging.getLogger(__name__)
# Models this source can emit; each one has a resource of the same name.
SUPPORTED_MODEL_NAMES = ("visit", "bookmark")
# Keys under "roots" in the Bookmarks JSON that are walked for bookmarks.
ROOT_FOLDERS = ("bookmark_bar", "other", "synced")
# Both resources merge with delete-insert keyed on the binding id —
# presumably so each run replaces all rows of that binding; confirm against
# dlt's merge semantics.
MERGE_WRITE_DISPOSITION = {"disposition": "merge", "strategy": "delete-insert"}
MERGE_KEY = ("_ctx_binding_id",)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class BookmarkSkip:
    """Immutable record of a bookmark node that was skipped during scanning."""

    # Name of the profile the node was found in.
    profile: str
    # Raw node values, kept exactly as read from the JSON; they are typed as
    # ``object`` because the node failed validation.
    id: object
    guid: object
    name: object
    url: object
    # Folder path of the node at the time it was skipped.
    folder_path: str
    # Short machine-readable reason, e.g. "invalid_url".
    reason: str
    # Formatted validation error text describing the failure.
    validation_error: str
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def discover_profiles(
    data_dirs: Sequence[ChromeLocalDataDir],
) -> list[ChromeProfileIngress]:
    """Find Chrome profiles directly beneath the configured data directories.

    A subdirectory counts as a profile when it contains a ``History`` or a
    ``Bookmarks`` file. Configured directories that do not exist are ignored.

    Args:
        data_dirs: Named data directories to scan.

    Returns:
        Profiles named ``"<data dir name>/<subdirectory name>"``, grouped by
        data directory in the given order and sorted by path within each.

    Raises:
        RuntimeError: If no profile is found in any of the directories.
    """
    marker_files = ("History", "Bookmarks")
    found: list[ChromeProfileIngress] = []

    for entry in data_dirs:
        root = entry.path
        if root.is_dir():
            for candidate in sorted(root.iterdir()):
                if candidate.is_dir() and any(
                    (candidate / marker).is_file() for marker in marker_files
                ):
                    payload = {
                        "name": f"{entry.name}/{candidate.name}",
                        "dir_path": str(candidate),
                    }
                    found.append(ChromeProfileIngress.model_validate(payload))

    if found:
        return found

    searched = ", ".join(str(entry.path) for entry in data_dirs)
    raise RuntimeError(
        f"No Chrome profiles found. "
        f"Searched for subdirectories containing History or Bookmarks in: {searched}"
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _iter_profile_visits(profile: ChromeProfileIngress) -> Iterator[ChromeVisit]:
    """Yield visit records from a single Chrome profile's History database.

    Yields nothing when the profile directory has no ``History`` file. Every
    yielded visit has its ``url_entry`` relationship loaded by the same query.
    """
    history_path = Path(profile.dir_path) / "History"
    if not history_path.is_file():
        return

    # Query the path provided by sqlite_snapshot rather than the original
    # file — presumably a copy, so a running browser's lock on History does
    # not interfere; confirm in shared_plugins.sqlite.
    with sqlite_snapshot(history_path) as snapshot_path:
        engine = create_engine(f"sqlite:///{snapshot_path}")
        try:
            with Session(engine) as session:
                # joinedload pulls each visit's ``urls`` row in the same SELECT.
                query = select(ChromeVisit).options(joinedload(ChromeVisit.url_entry))
                # The session stays open while rows are being consumed, because
                # this function is a generator.
                yield from session.scalars(query).unique()
        finally:
            # Release the engine's connections even if the consumer stops
            # early or an error occurs.
            engine.dispose()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def append_folder(folder_path: str, child_folder_name: str | None) -> str:
    """Return ``folder_path`` extended by ``child_folder_name``.

    The two parts are joined with a single ``/``. When the child name is
    ``None`` or empty, ``folder_path`` is returned unchanged.
    """
    if child_folder_name:
        return "/".join((folder_path, child_folder_name))
    return folder_path
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _to_bookmark_record_candidate(
    *,
    profile_name: str,
    node: Mapping[str, object],
    folder_path: str,
) -> ChromeBookmarkIngress | BookmarkSkip:
    """Validate one bookmark node, or describe why it has to be skipped.

    Args:
        profile_name: Name of the profile the node belongs to.
        node: Raw bookmark node from the Bookmarks JSON.
        folder_path: Folder path under which the node was found.

    Returns:
        The validated bookmark, or a ``BookmarkSkip`` with reason
        ``"invalid_url"`` when the formatted validation error mentions "url".

    Raises:
        ValidationError: Re-raised when the formatted error does not mention
            "url".
    """
    # The node's own keys come first; profile and folder_path always override.
    payload = {**node, "profile": profile_name, "folder_path": folder_path}

    try:
        return ChromeBookmarkIngress.model_validate(payload)
    except ValidationError as error:
        details = format_validation_error(error)
        # NOTE(review): this is a substring test on the formatted message, so
        # any error text containing "url" is classified as an invalid URL;
        # checking the structured error locations may be more precise — confirm
        # what format_validation_error emits before tightening.
        if "url" not in details:
            raise
        return BookmarkSkip(
            profile=profile_name,
            id=node.get("id"),
            guid=node.get("guid"),
            name=node.get("name"),
            url=node.get("url"),
            folder_path=folder_path,
            reason="invalid_url",
            validation_error=details,
        )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _log_bookmark_skip(skip: BookmarkSkip) -> None:
    """Emit a warning containing the skipped bookmark as a JSON object.

    The object holds every ``BookmarkSkip`` field with keys sorted; values
    that are not JSON-serializable are rendered with ``str``.
    """
    logged_fields = (
        "profile",
        "id",
        "guid",
        "name",
        "url",
        "folder_path",
        "reason",
        "validation_error",
    )
    payload = {field: getattr(skip, field) for field in logged_fields}
    serialized = json.dumps(payload, sort_keys=True, default=str)
    LOGGER.warning("chrome_local.bookmark_skipped %s", serialized)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _walk_bookmark_node(
    *,
    profile_name: str,
    node_value: object,
    folder_path: str,
    skipped_bookmarks: list[BookmarkSkip],
) -> Iterator[ChromeBookmarkIngress]:
    """Walk a bookmark tree depth-first and yield its valid URL bookmarks.

    Args:
        profile_name: Name of the profile the tree belongs to.
        node_value: Current node. Anything that is not a mapping, or whose
            ``type`` is neither ``"url"`` nor ``"folder"``, yields nothing.
        folder_path: Folder path of ``node_value``; a child folder's name is
            appended to it before that child is walked.
        skipped_bookmarks: Output list; every skipped URL node is logged and
            appended to it.

    Yields:
        Validated bookmarks in document order.
    """
    if not isinstance(node_value, Mapping):
        return

    node_type = node_value.get("type")
    if node_type == "url":
        bookmark_result = _to_bookmark_record_candidate(
            profile_name=profile_name,
            node=node_value,
            folder_path=folder_path,
        )
        if isinstance(bookmark_result, BookmarkSkip):
            _log_bookmark_skip(bookmark_result)
            skipped_bookmarks.append(bookmark_result)
            return
        yield bookmark_result
        return

    if node_type != "folder":
        return

    children = node_value.get("children")
    if not isinstance(children, list):
        return

    for child in children:
        # Only folder children extend the path; URL children stay in the
        # current folder. All node handling happens in the recursive call, so
        # the validate/log/skip logic exists in exactly one place.
        child_folder_path = folder_path
        if isinstance(child, Mapping) and child.get("type") == "folder":
            child_name = child.get("name")
            child_folder_path = append_folder(
                folder_path,
                child_name if isinstance(child_name, str) else None,
            )

        yield from _walk_bookmark_node(
            profile_name=profile_name,
            node_value=child,
            folder_path=child_folder_path,
            skipped_bookmarks=skipped_bookmarks,
        )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _load_profile_bookmarks(
    profile: ChromeProfileIngress,
) -> tuple[list[ChromeBookmarkIngress], list[BookmarkSkip]]:
    """Read one profile's ``Bookmarks`` file and collect its URL bookmarks.

    Args:
        profile: Profile whose directory may contain a ``Bookmarks`` file.

    Returns:
        ``(bookmarks, skipped)``. Both lists are empty when the file does not
        exist. Root folders are visited in ``ROOT_FOLDERS`` order.

    Raises:
        RuntimeError: If the JSON root is not an object, or its ``roots``
            entry is missing or not an object.
    """
    source_file = Path(profile.dir_path) / "Bookmarks"
    if not source_file.is_file():
        return [], []

    document = json.loads(source_file.read_text(encoding="utf-8"))
    if not isinstance(document, Mapping):
        raise RuntimeError("Chrome Bookmarks root must be a JSON object.")

    root_nodes = document.get("roots")
    if not isinstance(root_nodes, Mapping):
        raise RuntimeError(
            "Chrome Bookmarks JSON missing 'roots'; cannot extract bookmarks."
        )

    collected: list[ChromeBookmarkIngress] = []
    skips: list[BookmarkSkip] = []
    for root_name in ROOT_FOLDERS:
        root_node = root_nodes.get(root_name)
        if root_node is not None:
            # The walker appends to ``skips`` as it goes.
            collected += _walk_bookmark_node(
                profile_name=profile.name,
                node_value=root_node,
                folder_path=root_name,
                skipped_bookmarks=skips,
            )

    return collected, skips
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _scan_chrome_bookmarks(
    data_dirs: Sequence[ChromeLocalDataDir],
) -> list[ChromeBookmarkIngress]:
    """Collect the valid URL bookmarks of every discovered profile.

    Logs one summary line with the number of profiles, bookmarks and skipped
    bookmarks. Skipped bookmarks are only counted, not returned.

    Args:
        data_dirs: Named data directories to scan for profiles.

    Returns:
        All valid bookmarks, in profile discovery order.
    """
    found_profiles = discover_profiles(data_dirs)
    all_bookmarks: list[ChromeBookmarkIngress] = []
    skip_count = 0

    for found_profile in found_profiles:
        loaded, skipped = _load_profile_bookmarks(found_profile)
        all_bookmarks += loaded
        skip_count += len(skipped)

    LOGGER.info(
        "chrome_local.bookmarks_discovered profiles=%d bookmarks=%d skipped_bookmarks=%d",
        len(found_profiles),
        len(all_bookmarks),
        skip_count,
    )

    return all_bookmarks
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def chrome_local_snapshot_source(
    binding_id: str,
    cfg: ChromeLocalBindingConfig,
    *,
    binding_models: ResolvedBindingModels,
) -> tuple[Any, ...]:
    """Chrome local profile snapshot for one binding.

    Args:
        binding_id: Binding identifier stamped on every emitted row.
        cfg: Binding configuration providing the data directories to scan.
        binding_models: Resolved model selection; a resource yields nothing
            when its model is not in ``binding_models.active``.

    Returns:
        The ``visit`` and ``bookmark`` resources, in that order.
    """
    data_dirs = cfg.data_dirs

    @ctx_dlt_resource(
        name=dlt_resource_name("visit"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        merge_key=MERGE_KEY,
        primary_key=("_ctx_binding_id", "profile", "visit_id"),
    )
    def visit_resource() -> Iterator[VisitRow]:
        if "visit" not in binding_models.active:
            return

        # Discovery runs when the resource is iterated, not when the source
        # is built; it raises if no profile exists.
        profiles = discover_profiles(data_dirs)
        for profile in profiles:
            yield from iter_active_model_rows(
                model_name="visit",
                rows=visits_to_ctx_models(
                    binding_id=binding_id,
                    profile=profile.name,
                    visits=_iter_profile_visits(profile),
                ),
                binding_models=binding_models,
            )

    @ctx_dlt_resource(
        name=dlt_resource_name("bookmark"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        merge_key=MERGE_KEY,
        primary_key=("_ctx_binding_id", "profile", "id"),
    )
    def bookmark_resource() -> Iterator[BookmarkRow]:
        if "bookmark" not in binding_models.active:
            return

        # The bookmark scan performs its own profile discovery, and loads all
        # bookmarks into a list before any row is translated.
        yield from iter_active_model_rows(
            model_name="bookmark",
            rows=bookmarks_to_ctx_models(
                binding_id=binding_id,
                bookmarks=_scan_chrome_bookmarks(data_dirs),
            ),
            binding_models=binding_models,
        )

    return (
        visit_resource,
        bookmark_resource,
    )
|