logseq-retriever 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logseq_retriever/__init__.py +0 -0
- logseq_retriever/loaders/__init__.py +15 -0
- logseq_retriever/loaders/journal_document_metadata.py +32 -0
- logseq_retriever/loaders/journal_filesystem_loader.py +157 -0
- logseq_retriever/loaders/journal_loader.py +13 -0
- logseq_retriever/loaders/journal_loader_input.py +108 -0
- logseq_retriever/models/__init__.py +15 -0
- logseq_retriever/models/journal_pgvector.py +107 -0
- logseq_retriever/retrievers/__init__.py +19 -0
- logseq_retriever/retrievers/contextualizer.py +172 -0
- logseq_retriever/retrievers/journal_date_range_retriever.py +79 -0
- logseq_retriever/retrievers/journal_retriever.py +105 -0
- logseq_retriever/retrievers/pgvector_journal_retriever.py +95 -0
- logseq_retriever/uploaders/__init__.py +0 -0
- logseq_retriever/uploaders/pgvector/__init__.py +9 -0
- logseq_retriever/uploaders/pgvector/journal_corpus_manager.py +72 -0
- logseq_retriever-0.4.1.dist-info/METADATA +102 -0
- logseq_retriever-0.4.1.dist-info/RECORD +21 -0
- logseq_retriever-0.4.1.dist-info/WHEEL +5 -0
- logseq_retriever-0.4.1.dist-info/licenses/LICENSE +21 -0
- logseq_retriever-0.4.1.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from logseq_retriever.loaders.journal_document_metadata import (
|
|
2
|
+
LogseqJournalDocumentMetadata,
|
|
3
|
+
)
|
|
4
|
+
from logseq_retriever.loaders.journal_filesystem_loader import (
|
|
5
|
+
LogseqJournalFilesystemLoader,
|
|
6
|
+
)
|
|
7
|
+
from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
|
|
8
|
+
from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"LogseqJournalDocumentMetadata",
|
|
12
|
+
"LogseqJournalFilesystemLoader",
|
|
13
|
+
"LogseqJournalLoaderInput",
|
|
14
|
+
"LogseqJournalLoader",
|
|
15
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class LogseqJournalDocumentMetadata(BaseModel):
    """
    Metadata for a Logseq journal `Document`.
    """

    # All three fields are required: none of them declares a default.
    journal_date: str = Field(
        description="The date of the journal entry, in YYYY-MM-DD format.",
        examples=["2023-01-01", "2025-06-09"],
    )
    journal_tags: list[str] = Field(
        description="The tags associated with the journal entry.",
        examples=[["tag1", "tag2"], ["tag3"]],
    )
    journal_char_count: int = Field(
        description="The number of characters in the journal entry.",
    )
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from datetime import date, datetime
|
|
2
|
+
from logging import getLogger
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
|
|
7
|
+
from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
|
|
8
|
+
from logseq_retriever.loaders.journal_document_metadata import (
|
|
9
|
+
LogseqJournalDocumentMetadata,
|
|
10
|
+
)
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LogseqJournalFilesystemLoader(LogseqJournalLoader):
    """
    Based on input, load a collection of Logseq journal files from the filesystem, as
    Langchain `Document`s.

    Journal files are looked up directly inside `logseq_journal_path` (non-recursive) and
    are expected to be named after their date, e.g. `2025_03_27.md`.
    """

    def __init__(
        self,
        logseq_journal_path: str,
        **kwargs,
    ):
        """
        Initialize the loader with the path to the Logseq journal directory.

        Args:
            logseq_journal_path: Directory that should contain Logseq journal files,
                such as `2025_03_27.md`.
            **kwargs: Accepted but currently unused.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        self.logseq_journal_path = logseq_journal_path
        self._validate_logseq_journal_path()

    def load(  # type: ignore[override]
        self,
        input: LogseqJournalLoaderInput,
    ) -> list[Document]:
        """
        Synchronously load the documents from the Logseq journal directory, according to the input.

        Args:
            input: Date range (inclusive on both ends) and splitting options.

        Returns:
            The `Document`s of every journal file whose filename date falls in the range,
            ordered by filename (i.e. chronologically for `YYYY_MM_DD.md` names).

        Raises:
            ValueError: If `input.start_date` is later than `input.end_date`.
        """
        if input.start_date > input.end_date:
            raise ValueError("journal_end_date must be after journal_start_date")

        documents: list[Document] = []
        # TODO this glob pattern can be improved by analyzing start_date & end_date to provide fewer matches
        # `glob` yields entries in arbitrary filesystem order; sort so the result is deterministic.
        for path in sorted(Path(self.logseq_journal_path).glob("*.md")):
            filename = path.name
            if not self._match_journal(filename, input.start_date, input.end_date):
                continue
            # Logseq writes UTF-8 markdown; do not depend on the platform's default encoding.
            content = path.read_text(encoding="utf-8")
            documents.extend(
                self.__class__.parse_journal_markdown_file(
                    content, filename, input.enable_splitting
                )
            )
        return documents

    def _validate_logseq_journal_path(self):
        """
        Validate the path to the Logseq journal directory. Check that the directory exists.
        If the directory is empty, or does not contain files with the expected format, log a warning.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        # verify that the path exists, and is a directory
        if not os.path.exists(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path does not exist: {self.logseq_journal_path}"
            )
        if not os.path.isdir(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path is not a directory: {self.logseq_journal_path}"
            )

        # verify that the directory contains files with the expected format
        if not os.listdir(self.logseq_journal_path):
            logger.warning(
                f"Logseq journal directory is empty: {self.logseq_journal_path}"
            )
        elif not any(Path(self.logseq_journal_path).glob("*.md")):
            # `any` stops at the first match instead of listing every file.
            logger.warning(
                f"No files with .md extension found in {self.logseq_journal_path}"
            )

    def _match_journal(self, filename: str, start_date: date, end_date: date) -> bool:
        """
        Return `True` if journal date is between `start_date` & `end_date` (both inclusive).

        Args:
            filename: The journal filename (e.g., "2025_03_27.md")
            start_date: The start date as a `datetime.date` object
            end_date: The end date as a `datetime.date` object

        Returns:
            bool: True if the file's date is within the range, False otherwise
                (including when the filename does not encode a date).
        """
        if not filename.endswith(".md"):
            return False

        try:
            # Convert filename to date object
            file_date = datetime.strptime(filename[:-3], "%Y_%m_%d").date()
        except ValueError:
            # If there's any issue parsing the date from filename, skip this file
            return False
        return start_date <= file_date <= end_date

    @staticmethod
    def parse_journal_markdown_file(
        content: str, filename: str, enable_splitting: bool = True
    ) -> list[Document]:
        """
        Generate `Document`s from a file's contents. If necessary, split content into digestible
        `Document`s, and attach metadata.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            content: Full text of the journal file.
            filename: Name of the journal file; the journal date is derived from it.
            enable_splitting: When true, split on `"\\n- "` so each top-level bullet becomes
                its own `Document`; otherwise the whole file is one `Document`.

        Returns:
            One `Document` per non-blank section, with surrounding whitespace stripped.
        """
        sections = content.split("\n- ") if enable_splitting else [content]
        docs = []
        for section in sections:
            # Sections that are empty after stripping produce no Document.
            if section_content := section.strip():
                # TODO: enforce a max char count here (split further when a section is too long)
                metadata = (
                    LogseqJournalFilesystemLoader.parse_journal_markdown_file_metadata(
                        section_content, filename
                    )
                )
                docs.append(
                    Document(
                        page_content=section_content, metadata=metadata.model_dump()
                    )
                )
        return docs

    @staticmethod
    def parse_journal_markdown_file_metadata(
        section: str, filename: str
    ) -> LogseqJournalDocumentMetadata:
        """
        Parse metadata from a journal markdown file. Return `LogseqJournalDocumentMetadata`.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            section: The text the metadata describes; its length becomes `journal_char_count`.
            filename: Journal filename such as `2025_03_27.md`.
        """
        # Extract date from filename: drop only the trailing extension, then `_` -> `-`.
        date_str = filename.removesuffix(".md").replace("_", "-")
        char_count = len(section)

        return LogseqJournalDocumentMetadata(
            journal_date=date_str,
            # TODO get tags from Document's contents
            journal_tags=[],
            journal_char_count=char_count,
        )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from langchain_core.document_loaders import BaseLoader
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LogseqJournalLoader(BaseLoader):
    """
    Base class for loading Logseq journal files.

    This base implementation of `load` only raises; concrete loaders override it.
    """

    def load(self, input: Any) -> list[Document]:  # type: ignore[override]  # ty: ignore[invalid-method-override]
        """Load `Document`s for `input`. Always raises here; subclasses provide the behavior."""
        message = "This method should be implemented by subclasses."
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from datetime import datetime, date
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from pydantic import (
|
|
5
|
+
BaseModel,
|
|
6
|
+
Field,
|
|
7
|
+
AfterValidator,
|
|
8
|
+
computed_field,
|
|
9
|
+
PrivateAttr,
|
|
10
|
+
model_validator,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _validate_date_format(value: str) -> str:
    """
    Validate a date string and normalize it to ISO format (YYYY-MM-DD).
    Accepts dates like '2023-3-1' and converts to '2023-03-01'.

    Args:
        value: Date string in year-month-day order, separated by `-`.

    Returns:
        The same date as a zero-padded `YYYY-MM-DD` string.

    Raises:
        ValueError: If `value` cannot be parsed as a calendar date.
    """
    try:
        parsed = _parse_date(value)
    except ValueError as err:
        raise ValueError(
            f"Invalid date: '{value}'. Expecting ISO-8601 format: YYYY-MM-DD"
        ) from err
    # `strptime` tolerates non-padded month/day, so return the canonical form
    # rather than echoing the input back.
    return parsed.isoformat()


def _parse_date(date_str: str) -> date:
    """
    Parse a date string into a datetime.date object.

    Raises:
        ValueError: If `date_str` does not match `%Y-%m-%d`.
    """
    return datetime.strptime(date_str, "%Y-%m-%d").date()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class LogseqJournalLoaderInput(BaseModel):
    """
    Input for a Logseq journal `Document` loader, to invoke a load.
    """

    # Start of the date range, as a string; checked by `_validate_date_format`.
    # No start <= end check happens in this model.
    journal_start_date: Annotated[
        str,
        Field(
            description="The start date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]
    # End of the date range, as a string; checked by `_validate_date_format`.
    journal_end_date: Annotated[
        str,
        Field(
            description="The end date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]
    # Defaults to 8192 characters.
    # NOTE(review): the example value 8196 looks like a typo for 8192 — confirm.
    max_char_length: Annotated[
        int,
        Field(
            description="The maximum number of characters to include in a single `Document`.",
            examples=[8196, 2000],
            default=1024 * 8,
        ),
    ] = 1024 * 8
    # Splitting is on by default.
    enable_splitting: Annotated[
        bool,
        Field(
            description="Whether to split the journal file into multiple `Document`s.",
            examples=[True, False],
            default=True,
        ),
    ] = True

    # Private attributes that won't be included in model_dump
    # (they are populated by `_parse_dates` and exposed via the computed fields below).
    _start_date: date = PrivateAttr()
    _end_date: date = PrivateAttr()

    @model_validator(mode="after")
    def _parse_dates(self) -> "LogseqJournalLoaderInput":
        """Parse date strings into date objects after validation."""
        # Runs after field validation, so both strings are already known to parse.
        self._start_date = _parse_date(self.journal_start_date)
        self._end_date = _parse_date(self.journal_end_date)
        return self

    @computed_field
    @property
    def start_date(self) -> date:
        """Get `journal_start_date` as a date object."""
        return self._start_date

    @computed_field
    @property
    def end_date(self) -> date:
        """Get `journal_end_date` as a date object."""
        return self._end_date
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# debugging only
|
|
98
|
+
if __name__ == "__main__":
    from pprint import pprint

    # Show the JSON schema generated for the input model.
    schema = LogseqJournalLoaderInput.model_json_schema()
    pprint(schema)

    # Build one sample instance and show what it serializes to.
    sample = LogseqJournalLoaderInput(
        journal_start_date="2023-01-01",
        journal_end_date="2023-01-02",
        max_char_length=1024 * 4,
    )
    print(sample.model_dump())
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from logseq_retriever.models.journal_pgvector import (
|
|
2
|
+
JournalDocument,
|
|
3
|
+
JournalCorpusMetadata,
|
|
4
|
+
JournalDocumentMetadata,
|
|
5
|
+
JournalSearchClientConfig,
|
|
6
|
+
JournalSearchQuery,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"JournalDocument",
|
|
11
|
+
"JournalCorpusMetadata",
|
|
12
|
+
"JournalDocumentMetadata",
|
|
13
|
+
"JournalSearchClientConfig",
|
|
14
|
+
"JournalSearchQuery",
|
|
15
|
+
]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from typing import Type
|
|
2
|
+
|
|
3
|
+
from pgvector.sqlalchemy import Vector
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
from sqlalchemy import Column, String
|
|
6
|
+
|
|
7
|
+
from pgvector_template.core import (
|
|
8
|
+
BaseDocument,
|
|
9
|
+
BaseDocumentMetadata,
|
|
10
|
+
BaseSearchClientConfig,
|
|
11
|
+
)
|
|
12
|
+
from pgvector_template.models.search import (
|
|
13
|
+
SearchQuery,
|
|
14
|
+
MetadataFilter,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class JournalDocument(BaseDocument):
    """
    Each `Corpus` is the entire entry for a given date. A corpus may consist of 1 or more chunks of `Document`s.
    Each `Corpus` has a set of metadata, and each `Document` chunk has all of those, plus more.
    """

    # Concrete mapped class (not abstract), stored in the `logseq_journal` table.
    __abstract__ = False
    __tablename__ = "logseq_journal"

    # Indexed string column sized to hold exactly one ISO date string (10 characters).
    corpus_id = Column(String(len("2025-06-09")), index=True)
    """Length of ISO date string"""
    # 1024-dimensional pgvector column.
    embedding = Column(Vector(1024))
    """Embedding vector"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class JournalCorpusMetadata(BaseDocumentMetadata):
    """Metadata schema for Logseq journal corpora. Consist of 1-or-more chunks, called `Document`s."""

    # corpus
    # Required; must be exactly four digits, two digits, two digits, separated by `-`.
    date_str: str = Field(
        pattern=r"^\d{4}-\d{2}-\d{2}$",
        description="Date in ISO format, e.g. `2025-04-20`",
    )

    # defaults
    document_type: str = Field(default="logseq_journal")
    schema_version: str = Field(default="2025-07-10")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class JournalDocumentMetadata(JournalCorpusMetadata):
    """Metadata schema for Logseq journal `Document`s. 1-or-more `Document`s make up a corpus."""

    # chunk/document
    # Required (no default).
    chunk_len: int = Field()
    """Length of the content in characters"""
    # NOTE(review): accepts `None` but declares no default, so callers must still pass it
    # explicitly — confirm whether `default=None` was intended.
    word_count: int | None = Field()
    """Length of the content in words"""
    # Optional; defaults to an empty list.
    references: list[str] = Field(default=[])
    """List of references to other Logseq documents, or journal dates"""
    # Optional; defaults to an empty list.
    anchor_ids: list[str] = Field(default=[])
    """Blocks in the document can have UUID anchors, which are referenced elsewhere. This is a list of all present"""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class JournalSearchClientConfig(BaseSearchClientConfig):
    """Configuration for the Logseq journal search client."""

    # Defaults bind the search client to the journal-specific document and metadata classes.
    document_cls: Type[BaseDocument] = JournalDocument
    """The document type to use for the search client."""
    document_metadata_cls: Type[BaseDocumentMetadata] = JournalDocumentMetadata
    """The document metadata type to use for the search client."""
    # embedding_provider
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class JournalSearchQuery(SearchQuery):
    """
    Standardized search query structure, specifically for searching Logseq `JournalDocument`s.
    At least 1 search criterion is required (text, keywords, metadata_filters), but multiple are allowed.
    Types are the same as in `SearchQuery`.
    Descriptions are customized to better suit Logseq `JournalDocument`'s.
    """

    # Optional semantic-search text; absent by default.
    text: str | None = None
    """
    String to match against using in a semantic search, i.e. using vector distance.
    Instead of passing in a question, rephrase the question to be a string/phrase matching closer
    to the content expected to be found.
    """

    # Optional exact-match keywords; empty by default.
    keywords: list[str] = []
    """
    List of keywords to **exact-match**.
    If any keywords are provided, at least 1 keyword must appear in the content,
    so use only if certain that the word will appear.
    Do not include keywords that can be covered in metadata_filters, e.g. dates, document type.
    If you are not certain that a word will appear, try using `text` for a semantic search instead.
    """

    # The JSON schema of `JournalDocumentMetadata` is embedded into this field's own
    # JSON schema under the `metadata_schema` key (computed once, at class definition).
    metadata_filters: list[MetadataFilter] = Field(
        default=[],
        json_schema_extra={
            "metadata_schema": JournalDocumentMetadata.model_json_schema()
        },
    )
    """
    List of metadata conditions that must be matched.
    Refer to `metadata_schema` for the expected schema, as it exists in the database.
    """

    # Defaults to 20; values below 3 are rejected (`ge=3`).
    limit: int = Field(20, ge=3)
    """Maximum number of results to return."""
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from logseq_retriever.retrievers.contextualizer import (
|
|
2
|
+
RetrieverContextualizerProps,
|
|
3
|
+
RetrieverContextualizer,
|
|
4
|
+
)
|
|
5
|
+
from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
|
|
6
|
+
from logseq_retriever.retrievers.journal_date_range_retriever import (
|
|
7
|
+
LogseqJournalDateRangeRetriever,
|
|
8
|
+
)
|
|
9
|
+
from logseq_retriever.retrievers.pgvector_journal_retriever import (
|
|
10
|
+
PGVectorJournalRetriever,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"RetrieverContextualizerProps",
|
|
15
|
+
"RetrieverContextualizer",
|
|
16
|
+
"LogseqJournalRetriever",
|
|
17
|
+
"LogseqJournalDateRangeRetriever",
|
|
18
|
+
"PGVectorJournalRetriever",
|
|
19
|
+
]
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from logging import getLogger
|
|
3
|
+
from textwrap import dedent
|
|
4
|
+
from typing import Annotated, Any, Optional, Type
|
|
5
|
+
|
|
6
|
+
from langchain_core.language_models import BaseLanguageModel
|
|
7
|
+
from langchain_core.runnables import Runnable
|
|
8
|
+
from langchain_core.prompts import PromptTemplate
|
|
9
|
+
from langchain_core.output_parsers import PydanticOutputParser
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
logger = getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Prompt layout without chat history: the configured prompt, the latest user input,
# then the output parser's format instructions.
RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE = dedent(
    """\
    {prompt}

    Latest user input: {user_input}

    {format_instructions}
    """
)
# Prompt layout with chat history: additionally leads with the realtime context and
# includes the chat history before the latest user input.
RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE_WITH_CHAT_HISTORY = dedent(
    """\
    Realtime context:
    {realtime_context}

    {prompt}

    Chat History:
    {chat_history}

    Latest user input: {user_input}

    {format_instructions}
    """
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class RetrieverContextualizerProps(BaseModel):
    """
    Contextualizers are a component within Langchain `Retriever`s, that transform a natural-language
    input (and history) into an actionable query, which can in turn be used to fetch relevant
    `Document`s to address the input. The actionable query output can be structured, or
    simply another string that can be used to query a Vectorstore.

    To do this, the core of the Contextualizer is an LLM. The `prompt` is used by the LLM to perform
    the transformation task.
    """

    # Required. The text must be passed as `description=`: the first positional argument
    # of `Field` is the *default value*, which would silently make `llm` optional with a
    # string default.
    llm: Annotated[
        BaseLanguageModel,
        Field(
            description="The LLM that will be used to transform the input into an actionable query.",
        ),
    ]

    # Optional; defaults to a generic "rephrase as a standalone question" prompt.
    prompt: Annotated[
        str,
        Field(
            description="The prompt to use for the LLM to transform the input into an actionable query.",
            examples=[
                "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {user_input}\nStandalone question:"
            ],
            default="Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {user_input}\nStandalone question:",
        ),
    ]

    # TODO impl validation on this schema
    output_schema: Annotated[
        Optional[Type[BaseModel]],
        Field(
            description="(Optional) Structured output schema, as a Pydantic `BaseModel`. If provided, will be added to the end of the prompt.",
            default=None,
        ),
    ] = None

    enable_chat_history: Annotated[
        bool,
        Field(
            description="Whether to enable chat history in the prompt.",
            default=True,
        ),
    ] = True
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class RetrieverContextualizer(Runnable):
    """
    A Runnable that transforms natural language input into an actionable query
    for retrievers, based on the provided configuration.

    The chain is built once in `__init__` as `prompt_template | llm | parser`, with
    pass-through logging steps in between.
    """

    def __init__(self, props: RetrieverContextualizerProps):
        """
        Initialize with validated props.

        Sets `chain`, `parser` and `prompt_template`, and records the parser type and the
        output type (`props.output_schema` when given, else the parser's own output type).
        """
        self.props = props
        self.chain = self._generate_chain()
        self._parser_type = self.parser._type
        self._output_type = (
            self.parser.OutputType
            if not self.props.output_schema
            else self.props.output_schema
        )

    def _generate_chain(self) -> Runnable:
        """
        Generate and return the appropriate chain based on props.

        Side effects:
            Assigns `self.parser` and `self.prompt_template`.

        Returns:
            A Runnable chain that processes inputs according to the configuration.
        """
        # If output schema is provided, use PydanticOutputParser
        if self.props.output_schema:
            self.parser = PydanticOutputParser(pydantic_object=self.props.output_schema)
            # create a PromptTemplate with partials
            self.prompt_template = PromptTemplate(
                input_variables=(
                    ["chat_history", "user_input"]
                    if self.props.enable_chat_history
                    else ["user_input"]
                ),
                partial_variables={
                    # Pass the function itself, not its result: callable partials are
                    # evaluated each time the prompt is formatted, so the "current
                    # datetime" is not frozen at construction time.
                    "realtime_context": self._get_realtime_context,
                    "prompt": self.props.prompt,
                    "format_instructions": self.parser.get_format_instructions(),
                },
                template=(
                    RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE_WITH_CHAT_HISTORY
                    if self.props.enable_chat_history
                    else RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE
                ),
            )

        else:
            # Otherwise, use the LLM and extract the string content
            # This ensures we get a clean string output rather than an LLM result object
            from langchain_core.output_parsers import StrOutputParser

            self.parser = StrOutputParser()
            self.prompt_template = PromptTemplate.from_template(self.props.prompt)

        # The lambdas are pass-through taps: logger calls return None, so `or x`
        # forwards the value unchanged to the next step.
        return (
            self.prompt_template
            | (lambda x: logger.debug("Contextualizer prompt: %s", x) or x)
            | self.props.llm
            | (lambda x: logger.debug("Contextualizer LLM output: %s", x) or x)
            | self.parser
            | (lambda x: logger.info("OutputParser output: %s", x) or x)
        )

    def invoke(self, input: dict[str, Any], config=None, **kwargs) -> Any:
        """
        Process the input through the chain.

        Args:
            input: The input to process, typically containing 'user_input' and 'chat_history'.
            config: Optional configuration for the chain.

        Returns:
            The processed output, either a string or a structured object based on the output_schema.
        """
        return self.chain.invoke(input, config=config, **kwargs)

    @staticmethod
    def _get_realtime_context() -> str:
        """
        Get a string representing the realtime context of the retriever, with info such as:
        - current datetime
        """
        return f"Current datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from logging import getLogger
|
|
3
|
+
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
from langchain_core.messages import BaseMessage
|
|
6
|
+
from logseq_retriever.loaders import LogseqJournalLoader
|
|
7
|
+
from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
|
|
8
|
+
from logseq_retriever.retrievers.contextualizer import RetrieverContextualizer
|
|
9
|
+
from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
logger = getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LogseqJournalDateRangeRetriever(LogseqJournalRetriever):
    """
    A `Retriever` that retrieves documents from a Logseq journal within a specified date range.
    """

    def __init__(
        self,
        contextualizer: RetrieverContextualizer,
        loader: LogseqJournalLoader,
        verbose: bool = True,
    ):
        """
        Wire a contextualizer and a loader into the `Retriever`.

        Args:
            contextualizer (`RetrieverContextualizer`): must produce `LogseqJournalLoaderInput`.
            loader (`LogseqJournalLoader`): turns that input into `Document`s.
            verbose: when true, log the contextualizer output and the document count.

        Raises:
            TypeError: if either collaborator has the wrong type, or the contextualizer's
                output type is not `LogseqJournalLoaderInput`.
        """
        super().__init__()

        # Contextualizer checks come first, then the loader check.
        contextualizer_ok = isinstance(contextualizer, RetrieverContextualizer)
        if not contextualizer_ok:
            raise TypeError(
                "Contextualizer must be an instance of RetrieverContextualizer"
            )
        produces_loader_input = not (
            contextualizer._output_type != LogseqJournalLoaderInput
        )
        if not produces_loader_input:
            raise TypeError(
                "Contextualizer output type must be LogseqJournalLoaderInput"
            )
        self._contextualizer = contextualizer

        loader_ok = isinstance(loader, LogseqJournalLoader)
        if not loader_ok:
            raise TypeError("Loader must be an instance of LogseqJournalLoader")
        self._loader = loader
        self._verbose = verbose

    def _build_loader_input(
        self,
        query: str,
        chat_history: Sequence[BaseMessage] = (),
    ) -> LogseqJournalLoaderInput:
        """
        Ask the `RetrieverContextualizer` to turn the natural-language `query` (plus any
        `chat_history`) into a `LogseqJournalLoaderInput` for the `LogseqJournalLoader`.

        Raises:
            TypeError: if the contextualizer returns anything else.
        """
        result = self._contextualizer.invoke(
            {"chat_history": chat_history, "user_input": query}
        )
        if self._verbose:
            logger.info(f"Contextualizer output: {result}")
        if isinstance(result, LogseqJournalLoaderInput):
            return result
        raise TypeError(
            f"Expected LogseqJournalLoaderInput but got {type(result).__name__}"
        )

    def _fetch_documents(
        self, loader_input: LogseqJournalLoaderInput
    ) -> list[Document]:
        """Run the loader on `loader_input` and return what it loads."""
        loaded = self._loader.load(loader_input)
        if self._verbose:
            logger.info(f"Retrieved {len(loaded)} documents")
        return loaded
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from logging import getLogger
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
|
|
7
|
+
from langchain_core.documents import Document
|
|
8
|
+
from langchain_core.exceptions import OutputParserException
|
|
9
|
+
from langchain_core.retrievers import BaseRetriever
|
|
10
|
+
from langchain_core.messages import BaseMessage
|
|
11
|
+
from pydantic import ValidationError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LogseqJournalRetriever(BaseRetriever):
|
|
18
|
+
"""
|
|
19
|
+
A Langchain `Retriever` that is specifically for retrieving Logseq journal `Document`'s,
|
|
20
|
+
based on a natural-language query. This `Retriever` will, in turn, leverage a Loader or
|
|
21
|
+
Vectorstore to retrieve relevant documents to the query.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
document_context: str = "These Documents represent journal entries. "
|
|
25
|
+
|
|
26
|
+
def retrieve(
    self, query: str, chat_history: Sequence[BaseMessage] | None = None
) -> list[Document]:
    """
    Retrieve documents for `query` directly, without going through LangChain's `invoke()`.

    Note: because `invoke()` is bypassed, LangSmith tracing and registered callbacks
    are not triggered. Use `invoke()` if those are needed.
    """
    # A missing or empty history is passed on as an empty tuple.
    history = chat_history or ()
    return self._execute(query, history)
|
|
36
|
+
|
|
37
|
+
def _get_relevant_documents(
|
|
38
|
+
self,
|
|
39
|
+
query: str | dict[str, Any],
|
|
40
|
+
*,
|
|
41
|
+
run_manager: CallbackManagerForRetrieverRun,
|
|
42
|
+
chat_history: Sequence[BaseMessage] | None = None,
|
|
43
|
+
) -> list[Document]:
|
|
44
|
+
"""
|
|
45
|
+
Called by `invoke`.
|
|
46
|
+
|
|
47
|
+
`query` can be provided as a `str` (a natural-language query), or as a dict where
|
|
48
|
+
`chat_history` can be provided additionally. Format:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
query = {
|
|
52
|
+
"user_input": "user's latest question",
|
|
53
|
+
"chat_history": [("AiMessage", )]
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Returns potentially relevant `langchain_core.documents.Document`s to answer the query.
|
|
58
|
+
"""
|
|
59
|
+
# Handle case where query is passed as a dictionary (e.g., {"user_input": "query", "chat_history": [...]})
|
|
60
|
+
if isinstance(query, dict):
|
|
61
|
+
actual_query = (
|
|
62
|
+
query.get("user_input") or query.get("input") or query.get("query", "")
|
|
63
|
+
)
|
|
64
|
+
chat_history = (
|
|
65
|
+
chat_history or query.get("chat_history") or query.get("history")
|
|
66
|
+
)
|
|
67
|
+
else:
|
|
68
|
+
actual_query = query
|
|
69
|
+
|
|
70
|
+
return self._execute(actual_query, chat_history or [])
|
|
71
|
+
|
|
72
|
+
def _execute(
|
|
73
|
+
self, query: str, chat_history: Sequence[BaseMessage]
|
|
74
|
+
) -> list[Document]:
|
|
75
|
+
try:
|
|
76
|
+
loader_input = self._build_loader_input(query, chat_history)
|
|
77
|
+
except (TypeError, ValidationError, OutputParserException):
|
|
78
|
+
logger.exception("Error building loader input")
|
|
79
|
+
return []
|
|
80
|
+
return self._fetch_documents(loader_input)
|
|
81
|
+
|
|
82
|
+
@abstractmethod
|
|
83
|
+
def _fetch_documents(
|
|
84
|
+
self,
|
|
85
|
+
loader_input: Any,
|
|
86
|
+
) -> list[Document]:
|
|
87
|
+
"""
|
|
88
|
+
Subclasses shall impl this method.
|
|
89
|
+
Return a list of `langchain_core.documents.Document`s based on the user's query
|
|
90
|
+
(and chat_history if available).
|
|
91
|
+
"""
|
|
92
|
+
raise NotImplementedError("This method shall be implemented by subclasses.")
|
|
93
|
+
|
|
94
|
+
@abstractmethod
|
|
95
|
+
def _build_loader_input(
|
|
96
|
+
self,
|
|
97
|
+
query: str,
|
|
98
|
+
chat_history: Sequence[BaseMessage] = (),
|
|
99
|
+
) -> Any:
|
|
100
|
+
"""
|
|
101
|
+
Subclasses shall impl this method.
|
|
102
|
+
Return a dataclass, based on the user's query and chat_history if available, which shall
|
|
103
|
+
be used in the subsequent step to load/query for relevant documents.
|
|
104
|
+
"""
|
|
105
|
+
raise NotImplementedError("This method shall be implemented by subclasses.")
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from logging import getLogger
|
|
3
|
+
|
|
4
|
+
from langchain_core.messages import BaseMessage
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from pgvector_template.core.search import SearchQuery
|
|
7
|
+
from pgvector_template.service import DocumentService
|
|
8
|
+
|
|
9
|
+
from logseq_retriever.retrievers.contextualizer import RetrieverContextualizer
|
|
10
|
+
from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
|
|
11
|
+
from logseq_retriever.models.journal_pgvector import JournalDocument
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PGVectorJournalRetriever(LogseqJournalRetriever):
    """
    A `Retriever` that relies on a PGVector backend to fetch Logseq journals.
    """

    def __init__(
        self,
        contextualizer: RetrieverContextualizer,
        document_service: DocumentService,
        verbose: bool = True,
        **kwargs,
    ):
        """
        Initialize the `Retriever` with a contextualizer and a document service.

        Args:
            contextualizer: turns the natural-language query into a `SearchQuery`; its
                `_output_type` must be `SearchQuery` or a subclass.
            document_service: its `search_client.search` is used to run the query.
            verbose: when truthy, the contextualizer output and result count are logged.
            **kwargs: forwarded to the base retriever's constructor (e.g. the
                `document_context` field declared on `LogseqJournalRetriever`).

        Raises:
            TypeError: if `contextualizer` or `document_service` has the wrong type, or the
                contextualizer's output type is not a `SearchQuery` subclass.
        """
        # Forward extra keyword arguments rather than silently discarding them, so that
        # caller-supplied base-class fields actually take effect.
        super().__init__(**kwargs)

        if not isinstance(contextualizer, RetrieverContextualizer):
            raise TypeError(
                "contextualizer must be an instance of RetrieverContextualizer"
            )
        if not issubclass(contextualizer._output_type, SearchQuery):
            raise TypeError(
                "contextualizer._output_type must be SearchQuery or a subclass"
            )
        self._contextualizer = contextualizer

        if not isinstance(document_service, DocumentService):
            raise TypeError("document_service must be an instance of DocumentService")
        self._document_service = document_service
        self._verbose = verbose

    def _build_loader_input(
        self,
        query: str,
        chat_history: Sequence[BaseMessage] = (),
    ) -> SearchQuery:
        """
        Based on the natural-language `query`, return an instance of `SearchQuery`,
        which can then be used to invoke the `DocumentService.search_client.search`.
        Use the `RetrieverContextualizer` to do this.

        Raises:
            TypeError: if the contextualizer returns something other than a `SearchQuery`.
        """
        contextualizer_input = {
            "chat_history": chat_history,
            "user_input": query,
        }
        db_query = self._contextualizer.invoke(contextualizer_input)
        if self._verbose:
            logger.info(f"Contextualizer output: {db_query}")
        if not isinstance(db_query, SearchQuery):
            raise TypeError(
                f"Expected SearchQuery or subclass but got {type(db_query).__name__}"
            )
        return db_query

    def _fetch_documents(self, loader_input: SearchQuery) -> list[Document]:
        """
        Return a list of `langchain_core.documents.Document`s based on the user's query
        (and chat_history if available).
        `loader_input` shall be an instance of `SearchQuery` or a subclass, in this context.
        """
        db_results = self._document_service.search_client.search(loader_input)
        if self._verbose:
            logger.info(f"Retrieved {len(db_results)} documents from PGVector.")
        return [
            self._build_langchain_document_from_pgvector_document(result.document)
            for result in db_results
        ]

    def _build_langchain_document_from_pgvector_document(
        self, pgvector_document: JournalDocument
    ) -> Document:
        """
        Build a LangChain document from a PGVector document.

        The content is converted with `str()`; the stored `document_metadata` becomes the
        LangChain document's metadata.
        """
        return Document(
            page_content=str(pgvector_document.content),
            # NOTE(review): presumably `document_metadata` can be None (e.g. a nullable
            # column) -- fall back to an empty dict so `Document` always receives a mapping.
            metadata=pgvector_document.document_metadata or {},
        )
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any, Type
|
|
3
|
+
|
|
4
|
+
from pgvector_template.core import (
|
|
5
|
+
BaseCorpusManager,
|
|
6
|
+
BaseCorpusManagerConfig,
|
|
7
|
+
BaseDocument,
|
|
8
|
+
BaseDocumentMetadata,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from logseq_retriever.models.journal_pgvector import (
|
|
12
|
+
JournalDocument,
|
|
13
|
+
JournalDocumentMetadata,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class JournalCorpusManagerConfig(BaseCorpusManagerConfig):
    """Settings for `JournalCorpusManager`, with defaults preset for Logseq journals."""

    schema_name: str = "logseq_journal"
    """Name of the schema to use for the corpus manager"""
    document_cls: type[BaseDocument] = JournalDocument
    """Class to use for the document model"""
    document_metadata_cls: type[BaseDocumentMetadata] = JournalDocumentMetadata
    """Class to use for the document metadata model"""
    # No default is declared here for `embedding_provider: BaseEmbeddingProvider`;
    # per the original author's note it is still required.
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class JournalCorpusManager(BaseCorpusManager):
    """
    CorpusManager for Logseq journals, where one `Corpus` is the full entry of a single date.
    """

    def _split_corpus(self, content: str, **kwargs) -> list[str]:
        """
        Break a journal file into chunks at every newline that is directly followed by `-`
        (a bullet starting in the first column).

        Each piece is whitespace-stripped, then relieved of one leading `-` and one leading
        space; pieces that end up empty are dropped.
        """
        chunks: list[str] = []
        for raw_piece in content.split("\n-"):
            text = raw_piece.strip().removeprefix("-").removeprefix(" ")
            if text:
                chunks.append(text)
        return chunks

    def _extract_chunk_metadata(self, content: str, **kwargs) -> dict[str, Any]:
        """
        Describe a chunk: its character length, whitespace-separated word count,
        `#` references and Logseq anchor IDs.
        """
        words = content.split()
        metadata: dict[str, Any] = {
            "chunk_len": len(content),
            "word_count": len(words),
        }
        metadata["references"] = self._extract_chunk_references(words)
        metadata["anchor_ids"] = self._extract_anchor_ids(content)
        return metadata

    def _extract_chunk_references(self, split_content: list[str]) -> list[str]:
        """
        Collect references to other Logseq corpora (other journals included) from the words
        of a chunk.

        A reference is a word starting with `#`, e.g. `#2025-07-07` or `#cookout`. Leading and
        trailing `#` characters and all backslashes are removed, and the reference ends at the
        first of the characters ! ? , : ' or double quote. Words that reduce to an empty
        string are skipped.
        """
        tagged_words = (token for token in split_content if token.startswith("#"))
        candidates = (
            # Cutting at the earliest break character is the same as cutting at each
            # break character in turn and keeping the head every time.
            re.split("[!?,:'\"]", token.strip("#").replace("\\", ""), maxsplit=1)[0]
            for token in tagged_words
        )
        return [name for name in candidates if name]

    def _extract_anchor_ids(self, content: str) -> list[str]:
        """Return every Logseq anchor ID found in `content`, written as `id:: <uuid>`."""
        anchor_pattern = re.compile(r"id:: ([a-f0-9-]{36})")
        return anchor_pattern.findall(content)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: logseq-retriever
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: Python library for loading and retrieving Logseq documents
|
|
5
|
+
Author-email: DL <v49t9zpqd@mozmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.11
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: langchain<0.4.0,>=0.3.24
|
|
14
|
+
Requires-Dist: pydantic<3.0,>=2.11
|
|
15
|
+
Requires-Dist: pgvector-template>=0.3.4
|
|
16
|
+
Provides-Extra: scripts
|
|
17
|
+
Requires-Dist: python-dotenv; extra == "scripts"
|
|
18
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == "scripts"
|
|
19
|
+
Requires-Dist: langchain-aws>=0.2.0; extra == "scripts"
|
|
20
|
+
Provides-Extra: test
|
|
21
|
+
Requires-Dist: pytest; extra == "test"
|
|
22
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
23
|
+
Requires-Dist: python-dotenv; extra == "test"
|
|
24
|
+
Requires-Dist: boto3; extra == "test"
|
|
25
|
+
Requires-Dist: boto3-stubs; extra == "test"
|
|
26
|
+
Requires-Dist: langchain-aws>=0.2.0; extra == "test"
|
|
27
|
+
Requires-Dist: psycopg[binary]>=3.0.0; extra == "test"
|
|
28
|
+
Requires-Dist: ty; extra == "test"
|
|
29
|
+
Requires-Dist: ruff; extra == "test"
|
|
30
|
+
Provides-Extra: dist
|
|
31
|
+
Requires-Dist: build>=1.2.2; extra == "dist"
|
|
32
|
+
Requires-Dist: twine>=6.1.0; extra == "dist"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# Logseq Retriever
|
|
36
|
+
Python library for loading and retrieving Logseq documents.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
## Components
|
|
40
|
+
This section provides an overview of the provided components, grouped by type.
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
### Retrievers
|
|
44
|
+
Retrievers inject context into a conversation. They work in tandem with a Contextualizer and a `Document` Loader.
|
|
45
|
+
- **Input**:
|
|
46
|
+
- natural-language user-input, usually query-like
|
|
47
|
+
- (optional) chat history
|
|
48
|
+
- **Output**:
|
|
49
|
+
- list of `Document`s to provide context for an LLM to answer the user-input
|
|
50
|
+
|
|
51
|
+
#### Implementations
|
|
52
|
+
- `LogseqJournalDateRangeRetriever`
|
|
53
|
+
- retrieve Logseq journal `Document`s, intended for queries that require context from a date range
|
|
54
|
+
- required to set up:
|
|
55
|
+
- `RetrieverContextualizer`
|
|
56
|
+
- `LogseqJournalLoader`
|
|
57
|
+
- examples:
|
|
58
|
+
- "What did I do over Christmas break 2024?"
|
|
59
|
+
- "How did I spend the last Independence Day?"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
### Contextualizers
|
|
63
|
+
Contextualizers serve as the bridge between natural-language input and a downstream component that
|
|
64
|
+
handles fetching of relevant `Document`s.
|
|
65
|
+
- **Input**:
|
|
66
|
+
- natural-language user-input, usually query-like
|
|
67
|
+
- (optional) chat history
|
|
68
|
+
- **Output**:
|
|
69
|
+
- structured downstream query, based on the user-input (and chat history, if provided)
|
|
70
|
+
|
|
71
|
+
In this library, an instance of `RetrieverContextualizer` is provided directly to
|
|
72
|
+
`Retriever`s during the latter's instantiation. To set up the `RetrieverContextualizer`, provide
|
|
73
|
+
`RetrieverContextualizerProps`, which includes:
|
|
74
|
+
- `llm` - this is the backbone of the contextualizer
|
|
75
|
+
- `prompt` - instructions provided to the LLM
|
|
76
|
+
- `output_schema` - (optional) structured schema used to fetch relevant `Document`s
|
|
77
|
+
- if no schema is provided, a string shall be returned instead
|
|
78
|
+
- other flags and settings
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
### Loaders
|
|
82
|
+
Loaders are one type of component that can fetch relevant `Document`s. Loaders are typically specific to
|
|
83
|
+
a corresponding `Retriever` component.
|
|
84
|
+
- **Input**:
|
|
85
|
+
- each loader specifies its own schema
|
|
86
|
+
- the Contextualizer is usually responsible for creating an instance of the query obj to act upon
|
|
87
|
+
- **Output**:
|
|
88
|
+
- `list[Document]`
|
|
89
|
+
|
|
90
|
+
#### Implementations
|
|
91
|
+
- `LogseqJournalFilesystemLoader`
|
|
92
|
+
- loads from the filesystem, where journal files are expected to be present at specified path
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
## Scripts
|
|
97
|
+
|
|
98
|
+
### PGVector
|
|
99
|
+
|
|
100
|
+
#### `upload_journal`
|
|
101
|
+
|
|
102
|
+
usage: `python scripts/upload_journal_to_pgvector.py [-h] [-p PATH] from_date to_date`
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
logseq_retriever/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
logseq_retriever/loaders/__init__.py,sha256=L5qqUEjx9A5sWdzHG-4c04z_KTxPnihJDh15gVmtWcs,507
|
|
3
|
+
logseq_retriever/loaders/journal_document_metadata.py,sha256=q5XDHX95gnOePP9v-Fwv79sYBrqP8p9Qh4ZyFoddPRk,744
|
|
4
|
+
logseq_retriever/loaders/journal_filesystem_loader.py,sha256=w0Z5Z8YLe8TLWTKbakRTPJ7dRr9qUMJZUFik9IHb9RE,6244
|
|
5
|
+
logseq_retriever/loaders/journal_loader.py,sha256=9e2bK_LDrTYHlUlHamD7MgGzFq8DBCAiBVweXAUtnzM,433
|
|
6
|
+
logseq_retriever/loaders/journal_loader_input.py,sha256=A88-NueZs6h9fNRs5Vm6Y00PgRZImYji2p6Wum-2edA,2963
|
|
7
|
+
logseq_retriever/models/__init__.py,sha256=nkbyo3WbzF4Z4XQAM-IT_SDNFytRo6UGEc_L84j4K-o,346
|
|
8
|
+
logseq_retriever/models/journal_pgvector.py,sha256=wtn2SM_GKqK45QZnTH8tEoDlBPWOlGfcRWGl65YvcF8,3759
|
|
9
|
+
logseq_retriever/retrievers/__init__.py,sha256=6MOLyW01yHK_AYpi_L7fKD9nHhJ6XYds_pGi5bAiQBM,597
|
|
10
|
+
logseq_retriever/retrievers/contextualizer.py,sha256=61YYXVanrHdgL_96bRMtob7Rzs0LuhEJpVb6lOwo__A,6141
|
|
11
|
+
logseq_retriever/retrievers/journal_date_range_retriever.py,sha256=dB0jDY5JaA_YjZf2G1tGC2uy_e0gVV0PSX61rb7pe4E,2850
|
|
12
|
+
logseq_retriever/retrievers/journal_retriever.py,sha256=e7MK2GRVDqo56B-xLEpF0HElZxNWVm50O1zrLx7uLio,3768
|
|
13
|
+
logseq_retriever/retrievers/pgvector_journal_retriever.py,sha256=vYcnicWLdqBIgSgjb_xnNMKvrSyCDGKz6rmGwr9RTdw,3525
|
|
14
|
+
logseq_retriever/uploaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
logseq_retriever/uploaders/pgvector/__init__.py,sha256=Y5wQzHtrEXaUwP05JZ8RqP5en_udUUV-f8ptbGl4MQE,210
|
|
16
|
+
logseq_retriever/uploaders/pgvector/journal_corpus_manager.py,sha256=7xCV5cgI4mJHPxTG92WMH8ylXOjB0PP7obDKwXxNuOU,2655
|
|
17
|
+
logseq_retriever-0.4.1.dist-info/licenses/LICENSE,sha256=4chADZoF7TXixgJtj6FYx2PiAjCMreSUMHevGcgdSG4,1069
|
|
18
|
+
logseq_retriever-0.4.1.dist-info/METADATA,sha256=ZwfjrzO5REJpb_cJ4Qh_rg8zXL_6DqOMd5waNBEgeUU,3563
|
|
19
|
+
logseq_retriever-0.4.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
logseq_retriever-0.4.1.dist-info/top_level.txt,sha256=1WZc9D05rwFgELxnbrEddvzcscLyUXD1ODwLpVx0r90,17
|
|
21
|
+
logseq_retriever-0.4.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 David Ge Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
logseq_retriever
|