logseq-retriever 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logseq_retriever-0.4.1/LICENSE +21 -0
- logseq_retriever-0.4.1/PKG-INFO +102 -0
- logseq_retriever-0.4.1/README.md +68 -0
- logseq_retriever-0.4.1/logseq_retriever/__init__.py +0 -0
- logseq_retriever-0.4.1/logseq_retriever/loaders/__init__.py +15 -0
- logseq_retriever-0.4.1/logseq_retriever/loaders/journal_document_metadata.py +32 -0
- logseq_retriever-0.4.1/logseq_retriever/loaders/journal_filesystem_loader.py +157 -0
- logseq_retriever-0.4.1/logseq_retriever/loaders/journal_loader.py +13 -0
- logseq_retriever-0.4.1/logseq_retriever/loaders/journal_loader_input.py +108 -0
- logseq_retriever-0.4.1/logseq_retriever/models/__init__.py +15 -0
- logseq_retriever-0.4.1/logseq_retriever/models/journal_pgvector.py +107 -0
- logseq_retriever-0.4.1/logseq_retriever/retrievers/__init__.py +19 -0
- logseq_retriever-0.4.1/logseq_retriever/retrievers/contextualizer.py +172 -0
- logseq_retriever-0.4.1/logseq_retriever/retrievers/journal_date_range_retriever.py +79 -0
- logseq_retriever-0.4.1/logseq_retriever/retrievers/journal_retriever.py +105 -0
- logseq_retriever-0.4.1/logseq_retriever/retrievers/pgvector_journal_retriever.py +95 -0
- logseq_retriever-0.4.1/logseq_retriever/uploaders/__init__.py +0 -0
- logseq_retriever-0.4.1/logseq_retriever/uploaders/pgvector/__init__.py +9 -0
- logseq_retriever-0.4.1/logseq_retriever/uploaders/pgvector/journal_corpus_manager.py +72 -0
- logseq_retriever-0.4.1/logseq_retriever.egg-info/PKG-INFO +102 -0
- logseq_retriever-0.4.1/logseq_retriever.egg-info/SOURCES.txt +24 -0
- logseq_retriever-0.4.1/logseq_retriever.egg-info/dependency_links.txt +1 -0
- logseq_retriever-0.4.1/logseq_retriever.egg-info/requires.txt +23 -0
- logseq_retriever-0.4.1/logseq_retriever.egg-info/top_level.txt +1 -0
- logseq_retriever-0.4.1/pyproject.toml +57 -0
- logseq_retriever-0.4.1/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 David Ge Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: logseq-retriever
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: Python library for loading and retrieving Logseq documents
|
|
5
|
+
Author-email: DL <v49t9zpqd@mozmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.11
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: langchain<0.4.0,>=0.3.24
|
|
14
|
+
Requires-Dist: pydantic<3.0,>=2.11
|
|
15
|
+
Requires-Dist: pgvector-template>=0.3.4
|
|
16
|
+
Provides-Extra: scripts
|
|
17
|
+
Requires-Dist: python-dotenv; extra == "scripts"
|
|
18
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == "scripts"
|
|
19
|
+
Requires-Dist: langchain-aws>=0.2.0; extra == "scripts"
|
|
20
|
+
Provides-Extra: test
|
|
21
|
+
Requires-Dist: pytest; extra == "test"
|
|
22
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
23
|
+
Requires-Dist: python-dotenv; extra == "test"
|
|
24
|
+
Requires-Dist: boto3; extra == "test"
|
|
25
|
+
Requires-Dist: boto3-stubs; extra == "test"
|
|
26
|
+
Requires-Dist: langchain-aws>=0.2.0; extra == "test"
|
|
27
|
+
Requires-Dist: psycopg[binary]>=3.0.0; extra == "test"
|
|
28
|
+
Requires-Dist: ty; extra == "test"
|
|
29
|
+
Requires-Dist: ruff; extra == "test"
|
|
30
|
+
Provides-Extra: dist
|
|
31
|
+
Requires-Dist: build>=1.2.2; extra == "dist"
|
|
32
|
+
Requires-Dist: twine>=6.1.0; extra == "dist"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# Logseq Retriever
|
|
36
|
+
Python library for loading and retrieving Logseq documents.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
## Components
|
|
40
|
+
This section gives an overview of the provided components, listed by type.
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
### Retrievers
|
|
44
|
+
Retrievers inject context into a conversation. They work in tandem with a Contextualizer and a `Document` Loader.
|
|
45
|
+
- **Input**:
|
|
46
|
+
- natural-language user-input, usually query-like
|
|
47
|
+
- (optional) chat history
|
|
48
|
+
- **Output**:
|
|
49
|
+
- list of `Document`s to provide context for an LLM to answer the user-input
|
|
50
|
+
|
|
51
|
+
#### Implementations
|
|
52
|
+
- `LogseqJournalDateRangeRetriever`
|
|
53
|
+
- retrieve Logseq journal `Document`s, intended for queries that require context from a date range
|
|
54
|
+
- required to set up:
|
|
55
|
+
- `RetrieverContextualizer`
|
|
56
|
+
- `LogseqJournalLoader`
|
|
57
|
+
- examples:
|
|
58
|
+
- "What did I do over Christmas break 2024?"
|
|
59
|
+
- "How did I spend the last Independence Day?"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
### Contextualizers
|
|
63
|
+
Contextualizers serve as the bridge between natural-language input and a downstream component that
|
|
64
|
+
handles fetching of relevant `Document`s.
|
|
65
|
+
- **Input**:
|
|
66
|
+
- natural-language user-input, usually query-like
|
|
67
|
+
- (optional) chat history
|
|
68
|
+
- **Output**:
|
|
69
|
+
- structured downstream query, based on the configured `output_schema` (a plain string if no schema is provided)
|
|
70
|
+
|
|
71
|
+
In this library, an instance of `RetrieverContextualizer` is provided directly to
|
|
72
|
+
`Retriever`s during the latter's instantiation. To set up the `RetrieverContextualizer`, provide
|
|
73
|
+
`RetrieverContextualizerProps`, which includes:
|
|
74
|
+
- `llm` - this is the backbone of the contextualizer
|
|
75
|
+
- `prompt` - instructions provided to the LLM
|
|
76
|
+
- `output_schema` - (optional) structured schema used to fetch relevant `Document`s
|
|
77
|
+
- if no schema is provided, a string shall be returned instead
|
|
78
|
+
- other flags and settings
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
### Loaders
|
|
82
|
+
Loaders are one type of component that can fetch relevant `Document`s. Loaders are typically specific to
|
|
83
|
+
a corresponding `Retriever` component.
|
|
84
|
+
- **Input**:
|
|
85
|
+
- each loader specifies its own schema
|
|
86
|
+
- the Contextualizer is usually responsible for creating an instance of the query obj to act upon
|
|
87
|
+
- **Output**:
|
|
88
|
+
- `list[Document]`
|
|
89
|
+
|
|
90
|
+
#### Implementations
|
|
91
|
+
- `LogseqJournalFilesystemLoader`
|
|
92
|
+
- loads from the filesystem, where journal files are expected to be present at specified path
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
## Scripts
|
|
97
|
+
|
|
98
|
+
### PGVector
|
|
99
|
+
|
|
100
|
+
#### `upload_journal`
|
|
101
|
+
|
|
102
|
+
usage: `python scripts/upload_journal_to_pgvector.py [-h] [-p PATH] from_date to_date`
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Logseq Retriever
|
|
2
|
+
Python library for loading and retrieving Logseq documents.
|
|
3
|
+
|
|
4
|
+
---
|
|
5
|
+
## Components
|
|
6
|
+
This section gives an overview of the provided components, listed by type.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
### Retrievers
|
|
10
|
+
Retrievers inject context into a conversation. They work in tandem with a Contextualizer and a `Document` Loader.
|
|
11
|
+
- **Input**:
|
|
12
|
+
- natural-language user-input, usually query-like
|
|
13
|
+
- (optional) chat history
|
|
14
|
+
- **Output**:
|
|
15
|
+
- list of `Document`s to provide context for an LLM to answer the user-input
|
|
16
|
+
|
|
17
|
+
#### Implementations
|
|
18
|
+
- `LogseqJournalDateRangeRetriever`
|
|
19
|
+
- retrieve Logseq journal `Document`s, intended for queries that require context from a date range
|
|
20
|
+
- required to set up:
|
|
21
|
+
- `RetrieverContextualizer`
|
|
22
|
+
- `LogseqJournalLoader`
|
|
23
|
+
- examples:
|
|
24
|
+
- "What did I do over Christmas break 2024?"
|
|
25
|
+
- "How did I spend the last Independence Day?"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
### Contextualizers
|
|
29
|
+
Contextualizers serve as the bridge between natural-language input and a downstream component that
|
|
30
|
+
handles fetching of relevant `Document`s.
|
|
31
|
+
- **Input**:
|
|
32
|
+
- natural-language user-input, usually query-like
|
|
33
|
+
- (optional) chat history
|
|
34
|
+
- **Output**:
|
|
35
|
+
- structured downstream query, based on the configured `output_schema` (a plain string if no schema is provided)
|
|
36
|
+
|
|
37
|
+
In this library, an instance of `RetrieverContextualizer` is provided directly to
|
|
38
|
+
`Retriever`s during the latter's instantiation. To set up the `RetrieverContextualizer`, provide
|
|
39
|
+
`RetrieverContextualizerProps`, which includes:
|
|
40
|
+
- `llm` - this is the backbone of the contextualizer
|
|
41
|
+
- `prompt` - instructions provided to the LLM
|
|
42
|
+
- `output_schema` - (optional) structured schema used to fetch relevant `Document`s
|
|
43
|
+
- if no schema is provided, a string shall be returned instead
|
|
44
|
+
- other flags and settings
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
### Loaders
|
|
48
|
+
Loaders are one type of component that can fetch relevant `Document`s. Loaders are typically specific to
|
|
49
|
+
a corresponding `Retriever` component.
|
|
50
|
+
- **Input**:
|
|
51
|
+
- each loader specifies its own schema
|
|
52
|
+
- the Contextualizer is usually responsible for creating an instance of the query obj to act upon
|
|
53
|
+
- **Output**:
|
|
54
|
+
- `list[Document]`
|
|
55
|
+
|
|
56
|
+
#### Implementations
|
|
57
|
+
- `LogseqJournalFilesystemLoader`
|
|
58
|
+
- loads from the filesystem, where journal files are expected to be present at specified path
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
## Scripts
|
|
63
|
+
|
|
64
|
+
### PGVector
|
|
65
|
+
|
|
66
|
+
#### `upload_journal`
|
|
67
|
+
|
|
68
|
+
usage: `python scripts/upload_journal_to_pgvector.py [-h] [-p PATH] from_date to_date`
|
|
File without changes
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from logseq_retriever.loaders.journal_document_metadata import (
|
|
2
|
+
LogseqJournalDocumentMetadata,
|
|
3
|
+
)
|
|
4
|
+
from logseq_retriever.loaders.journal_filesystem_loader import (
|
|
5
|
+
LogseqJournalFilesystemLoader,
|
|
6
|
+
)
|
|
7
|
+
from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
|
|
8
|
+
from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"LogseqJournalDocumentMetadata",
|
|
12
|
+
"LogseqJournalFilesystemLoader",
|
|
13
|
+
"LogseqJournalLoaderInput",
|
|
14
|
+
"LogseqJournalLoader",
|
|
15
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class LogseqJournalDocumentMetadata(BaseModel):
    """
    Metadata for a Logseq journal `Document`.
    """

    # None of the fields below declares a default, so all three are required.

    # Date of the journal entry the `Document` was taken from.
    journal_date: str = Field(
        description="The date of the journal entry, in YYYY-MM-DD format.",
        examples=["2023-01-01", "2025-06-09"],
    )

    # Tags attached to the journal entry.
    journal_tags: list[str] = Field(
        description="The tags associated with the journal entry.",
        examples=[["tag1", "tag2"], ["tag3"]],
    )

    # Size of the entry's content, counted in characters.
    journal_char_count: int = Field(
        description="The number of characters in the journal entry.",
    )
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from datetime import date, datetime
|
|
2
|
+
from logging import getLogger
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
|
|
7
|
+
from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
|
|
8
|
+
from logseq_retriever.loaders.journal_document_metadata import (
|
|
9
|
+
LogseqJournalDocumentMetadata,
|
|
10
|
+
)
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LogseqJournalFilesystemLoader(LogseqJournalLoader):
    """
    Based on input, load a collection of Logseq journal files from the filesystem, as
    Langchain `Document`s.

    Journal files are looked up directly inside `logseq_journal_path` (no recursion) and
    must be named after their date, e.g. `2025_03_27.md`.
    """

    def __init__(
        self,
        logseq_journal_path: str,
        **kwargs,
    ):
        """
        Initialize the loader with the path to the Logseq journal directory.
        `logseq_journal_path` should contain Logseq journal files, such as `2025_03_27.md`.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        # NOTE: extra keyword arguments are accepted for call-site compatibility but unused.
        self.logseq_journal_path = logseq_journal_path
        self._validate_logseq_journal_path()

    def load(  # type: ignore[override]
        self,
        input: LogseqJournalLoaderInput,
    ) -> list[Document]:
        """
        Synchronously load the documents from the Logseq journal directory, according to the input.

        Args:
            input: Date range (inclusive on both ends) and splitting options.

        Returns:
            The `Document`s of every journal file whose date falls within the range,
            in ascending filename (i.e. chronological) order.

        Raises:
            ValueError: If the start date is later than the end date.
        """
        # Equal start and end dates are allowed: that selects a single day.
        if input.start_date > input.end_date:
            raise ValueError("journal_end_date must be after journal_start_date")

        documents: list[Document] = []
        # TODO this glob pattern can be improved by analyzing start_date & end_date to provide fewer matches
        # glob() yields entries in filesystem order; sorting makes the result deterministic,
        # and because filenames are `YYYY_MM_DD.md` it is also chronological.
        for path in sorted(Path(self.logseq_journal_path).glob("*.md")):
            filename = path.name
            if not self._match_journal(filename, input.start_date, input.end_date):
                continue
            # Read explicitly as UTF-8 rather than relying on the platform's default
            # encoding, which is not UTF-8 everywhere.
            content = path.read_text(encoding="utf-8")
            documents.extend(
                self.__class__.parse_journal_markdown_file(
                    content, filename, input.enable_splitting
                )
            )
        return documents

    def _validate_logseq_journal_path(self):
        """
        Validate the path to the Logseq journal directory. Check that the directory exists.
        If the directory is empty, or does not contain files with the expected format, log a warning.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        # verify that the path exists, and is a directory
        if not os.path.exists(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path does not exist: {self.logseq_journal_path}"
            )
        if not os.path.isdir(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path is not a directory: {self.logseq_journal_path}"
            )

        # An empty or markdown-free directory is suspicious but not fatal: warn only.
        if not os.listdir(self.logseq_journal_path):
            logger.warning(
                f"Logseq journal directory is empty: {self.logseq_journal_path}"
            )
        # any() stops at the first match instead of materializing the whole listing.
        if not any(Path(self.logseq_journal_path).glob("*.md")):
            logger.warning(
                f"No files with .md extension found in {self.logseq_journal_path}"
            )

    def _match_journal(self, filename: str, start_date: date, end_date: date) -> bool:
        """
        Return `True` if journal date is between `start_date` & `end_date` (both inclusive).

        Args:
            filename: The journal filename (e.g., "2025_03_27.md")
            start_date: The start date as a `date` object
            end_date: The end date as a `date` object

        Returns:
            bool: True if the file's date is within the range, False otherwise
        """
        if not filename.endswith(".md"):
            return False

        try:
            # Convert filename (minus the ".md" suffix) to a date object
            file_date = datetime.strptime(filename[:-3], "%Y_%m_%d").date()
        except ValueError:
            # The filename does not encode a date, so it is not a journal file: skip it
            return False
        return start_date <= file_date <= end_date

    @staticmethod
    def parse_journal_markdown_file(
        content: str, filename: str, enable_splitting: bool = True
    ) -> list[Document]:
        """
        Generate `Document`s from a file's contents. If necessary, split content into digestible
        `Document`s, and attach metadata.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            content: Full text of the journal file.
            filename: Name of the journal file (e.g., "2025_03_27.md"); used for metadata.
            enable_splitting: If True, produce one `Document` per top-level block;
                otherwise a single `Document` for the whole file.

        Returns:
            One `Document` per non-blank section, in file order.
        """
        # Only a "- " at the very start of a line begins a new section.
        # NOTE: the separator is consumed by split(), so every section except the first
        # loses its leading "- " while the first one keeps it (when present).
        sections = content.split("\n- ") if enable_splitting else [content]
        docs = []
        for section in sections:
            section_content = section.strip()
            if not section_content:
                # whitespace-only sections carry no information
                continue
            # TODO: enforce a maximum character count per Document here (splitting
            # further when a section is too long), driven by the loader input.
            metadata = (
                LogseqJournalFilesystemLoader.parse_journal_markdown_file_metadata(
                    section_content, filename
                )
            )
            docs.append(
                Document(page_content=section_content, metadata=metadata.model_dump())
            )
        return docs

    @staticmethod
    def parse_journal_markdown_file_metadata(
        section: str, filename: str
    ) -> LogseqJournalDocumentMetadata:
        """
        Parse metadata from a journal markdown file. Return `LogseqJournalDocumentMetadata`.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            section: The text of the `Document` the metadata describes.
            filename: Name of the journal file (e.g., "2025_03_27.md").
        """
        # Derive the date from the filename: drop only the trailing ".md", then turn
        # "2025_03_27" into "2025-03-27".
        date_str = filename.removesuffix(".md").replace("_", "-")

        return LogseqJournalDocumentMetadata(
            journal_date=date_str,
            # TODO get tags from Document's contents
            journal_tags=[],
            journal_char_count=len(section),
        )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from langchain_core.document_loaders import BaseLoader
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LogseqJournalLoader(BaseLoader):
    """
    Common parent of every loader that produces Logseq journal `Document`s.

    Concrete loaders override `load`; this base implementation always raises.
    """

    def load(self, input: Any) -> list[Document]:  # type: ignore[override]  # ty: ignore[invalid-method-override]
        # The parameter list intentionally differs from `BaseLoader.load`, hence the
        # type-checker suppressions on the signature.
        message = "This method should be implemented by subclasses."
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from datetime import datetime, date
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from pydantic import (
|
|
5
|
+
BaseModel,
|
|
6
|
+
Field,
|
|
7
|
+
AfterValidator,
|
|
8
|
+
computed_field,
|
|
9
|
+
PrivateAttr,
|
|
10
|
+
model_validator,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _validate_date_format(value: str) -> str:
    """
    Validate a date string and normalize it to ISO format (YYYY-MM-DD).
    Accepts dates like '2023-3-1' and converts to '2023-03-01'.

    Args:
        value: The date string to validate.

    Returns:
        The same date, zero-padded in ISO-8601 form.

    Raises:
        ValueError: If `value` cannot be parsed as a YYYY-MM-DD date.
    """
    try:
        # strptime tolerates unpadded month/day values, so re-serialize the parsed
        # date rather than returning the input verbatim.
        return _parse_date(value).isoformat()
    except ValueError as err:
        raise ValueError(
            f"Invalid date: '{value}'. Expecting ISO-8601 format: YYYY-MM-DD"
        ) from err


def _parse_date(date_str: str) -> date:
    """
    Parse a date string into a datetime.date object.

    Raises:
        ValueError: If `date_str` does not match the `%Y-%m-%d` format.
    """
    return datetime.strptime(date_str, "%Y-%m-%d").date()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class LogseqJournalLoaderInput(BaseModel):
    """
    Input for a Logseq journal `Document` loader, to invoke a load.
    """

    # --- date range; both strings are checked by `_validate_date_format` ---
    journal_start_date: Annotated[
        str,
        Field(
            description="The start date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]
    journal_end_date: Annotated[
        str,
        Field(
            description="The end date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]

    # --- chunking options; the default is stated once, via the assignment ---
    max_char_length: Annotated[
        int,
        Field(
            description="The maximum number of characters to include in a single `Document`.",
            examples=[8196, 2000],
        ),
    ] = 1024 * 8
    enable_splitting: Annotated[
        bool,
        Field(
            description="Whether to split the journal file into multiple `Document`s.",
            examples=[True, False],
        ),
    ] = True

    # Parsed counterparts of the two date strings. Being private attributes, they are
    # not part of `model_dump()`; they are exposed through the computed fields below.
    _start_date: date = PrivateAttr()
    _end_date: date = PrivateAttr()

    @model_validator(mode="after")
    def _parse_dates(self) -> "LogseqJournalLoaderInput":
        """Populate the private `date` attributes once field validation has passed."""
        self._start_date, self._end_date = (
            _parse_date(self.journal_start_date),
            _parse_date(self.journal_end_date),
        )
        return self

    @computed_field
    @property
    def start_date(self) -> date:
        """Get `journal_start_date` as a date object."""
        return self._start_date

    @computed_field
    @property
    def end_date(self) -> date:
        """Get `journal_end_date` as a date object."""
        return self._end_date
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# debugging only: print the model's JSON schema and a dumped sample instance
if __name__ == "__main__":
    from pprint import pprint

    schema = LogseqJournalLoaderInput.model_json_schema()
    pprint(schema)

    sample_kwargs = {
        "journal_start_date": "2023-01-01",
        "journal_end_date": "2023-01-02",
        "max_char_length": 1024 * 4,
    }
    example = LogseqJournalLoaderInput(**sample_kwargs)
    print(example.model_dump())
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from logseq_retriever.models.journal_pgvector import (
|
|
2
|
+
JournalDocument,
|
|
3
|
+
JournalCorpusMetadata,
|
|
4
|
+
JournalDocumentMetadata,
|
|
5
|
+
JournalSearchClientConfig,
|
|
6
|
+
JournalSearchQuery,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"JournalDocument",
|
|
11
|
+
"JournalCorpusMetadata",
|
|
12
|
+
"JournalDocumentMetadata",
|
|
13
|
+
"JournalSearchClientConfig",
|
|
14
|
+
"JournalSearchQuery",
|
|
15
|
+
]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from typing import Type
|
|
2
|
+
|
|
3
|
+
from pgvector.sqlalchemy import Vector
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
from sqlalchemy import Column, String
|
|
6
|
+
|
|
7
|
+
from pgvector_template.core import (
|
|
8
|
+
BaseDocument,
|
|
9
|
+
BaseDocumentMetadata,
|
|
10
|
+
BaseSearchClientConfig,
|
|
11
|
+
)
|
|
12
|
+
from pgvector_template.models.search import (
|
|
13
|
+
SearchQuery,
|
|
14
|
+
MetadataFilter,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class JournalDocument(BaseDocument):
    """
    Each `Corpus` is the entire entry for a given date. A corpus may consist of 1 or more chunks of `Document`s.
    Each `Corpus` has a set of metadata, and each `Document` chunk has all of those, plus more.
    """

    # NOTE(review): presumably marks this subclass as a concrete, table-mapped model
    # (as opposed to an abstract base) — confirm against `pgvector_template`.
    __abstract__ = False
    __tablename__ = "logseq_journal"

    # Indexed string column sized to exactly fit an ISO date such as "2025-06-09"
    # (10 characters).
    corpus_id = Column(String(len("2025-06-09")), index=True)
    """Length of ISO date string"""
    # pgvector column declared with 1024 dimensions.
    embedding = Column(Vector(1024))
    """Embedding vector"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class JournalCorpusMetadata(BaseDocumentMetadata):
    """Metadata schema for Logseq journal corpora. Consist of 1-or-more chunks, called `Document`s."""

    # corpus
    # Required. The pattern only enforces the digit layout NNNN-NN-NN; it does not
    # check that the value is a real calendar date.
    date_str: str = Field(
        pattern=r"^\d{4}-\d{2}-\d{2}$",
        description="Date in ISO format, e.g. `2025-04-20`",
    )

    # defaults
    document_type: str = Field(default="logseq_journal")
    # Version tag of this metadata schema, expressed as a date string.
    schema_version: str = Field(default="2025-07-10")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class JournalDocumentMetadata(JournalCorpusMetadata):
    """Metadata schema for Logseq journal `Document`s. 1-or-more `Document`s make up a corpus."""

    # chunk/document
    # Required (no default).
    chunk_len: int = Field()
    """Length of the content in characters"""
    # May be None, but no default is declared, so a value must still be passed explicitly.
    word_count: int | None = Field()
    """Length of the content in words"""
    # Optional; defaults to an empty list.
    references: list[str] = Field(default=[])
    """List of references to other Logseq documents, or journal dates"""
    # Optional; defaults to an empty list.
    anchor_ids: list[str] = Field(default=[])
    """Blocks in the document can have UUID anchors, which are referenced elsewhere. This is a list of all present"""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class JournalSearchClientConfig(BaseSearchClientConfig):
    """Configuration for the Logseq journal search client."""

    # Both attributes default to the journal-specific classes defined in this module.
    document_cls: Type[BaseDocument] = JournalDocument
    """The document type to use for the search client."""
    document_metadata_cls: Type[BaseDocumentMetadata] = JournalDocumentMetadata
    """The document metadata type to use for the search client."""
    # embedding_provider
    # NOTE(review): no `embedding_provider` attribute is declared here; the line above
    # looks like a placeholder — confirm whether the base config supplies it.
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class JournalSearchQuery(SearchQuery):
    """
    Standardized search query structure, specifically for searching Logseq `JournalDocument`s.
    At least 1 search criterion is required (text, keywords, metadata_filters), but multiple are allowed.
    Types are the same as in `SearchQuery`.
    Descriptions are customized to better suit Logseq `JournalDocument`'s.
    """

    # NOTE(review): nothing in this class enforces the "at least 1 search criterion"
    # rule stated above — presumably `SearchQuery` validates it; confirm.

    # Optional; defaults to None.
    text: str | None = None
    """
    String to match against using in a semantic search, i.e. using vector distance.
    Instead of passing in a question, rephrase the question to be a string/phrase matching closer
    to the content expected to be found.
    """

    # Optional; defaults to an empty list.
    keywords: list[str] = []
    """
    List of keywords to **exact-match**.
    If any keywords are provided, at least 1 keyword must appear in the content,
    so use only if certain that the word will appear.
    Do not include keywords that can be covered in metadata_filters, e.g. dates, document type.
    If you are not certain that a word will appear, try using `text` for a semantic search instead.
    """

    # The JSON schema of `JournalDocumentMetadata` is computed once, when this class is
    # defined, and embedded into this field's own JSON schema under "metadata_schema".
    metadata_filters: list[MetadataFilter] = Field(
        default=[],
        json_schema_extra={
            "metadata_schema": JournalDocumentMetadata.model_json_schema()
        },
    )
    """
    List of metadata conditions that must be matched.
    Refer to `metadata_schema` for the expected schema, as it exists in the database.
    """

    # Defaults to 20; values below 3 are rejected (`ge=3`).
    limit: int = Field(20, ge=3)
    """Maximum number of results to return."""
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from logseq_retriever.retrievers.contextualizer import (
|
|
2
|
+
RetrieverContextualizerProps,
|
|
3
|
+
RetrieverContextualizer,
|
|
4
|
+
)
|
|
5
|
+
from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
|
|
6
|
+
from logseq_retriever.retrievers.journal_date_range_retriever import (
|
|
7
|
+
LogseqJournalDateRangeRetriever,
|
|
8
|
+
)
|
|
9
|
+
from logseq_retriever.retrievers.pgvector_journal_retriever import (
|
|
10
|
+
PGVectorJournalRetriever,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"RetrieverContextualizerProps",
|
|
15
|
+
"RetrieverContextualizer",
|
|
16
|
+
"LogseqJournalRetriever",
|
|
17
|
+
"LogseqJournalDateRangeRetriever",
|
|
18
|
+
"PGVectorJournalRetriever",
|
|
19
|
+
]
|