logseq-retriever 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. logseq_retriever-0.4.1/LICENSE +21 -0
  2. logseq_retriever-0.4.1/PKG-INFO +102 -0
  3. logseq_retriever-0.4.1/README.md +68 -0
  4. logseq_retriever-0.4.1/logseq_retriever/__init__.py +0 -0
  5. logseq_retriever-0.4.1/logseq_retriever/loaders/__init__.py +15 -0
  6. logseq_retriever-0.4.1/logseq_retriever/loaders/journal_document_metadata.py +32 -0
  7. logseq_retriever-0.4.1/logseq_retriever/loaders/journal_filesystem_loader.py +157 -0
  8. logseq_retriever-0.4.1/logseq_retriever/loaders/journal_loader.py +13 -0
  9. logseq_retriever-0.4.1/logseq_retriever/loaders/journal_loader_input.py +108 -0
  10. logseq_retriever-0.4.1/logseq_retriever/models/__init__.py +15 -0
  11. logseq_retriever-0.4.1/logseq_retriever/models/journal_pgvector.py +107 -0
  12. logseq_retriever-0.4.1/logseq_retriever/retrievers/__init__.py +19 -0
  13. logseq_retriever-0.4.1/logseq_retriever/retrievers/contextualizer.py +172 -0
  14. logseq_retriever-0.4.1/logseq_retriever/retrievers/journal_date_range_retriever.py +79 -0
  15. logseq_retriever-0.4.1/logseq_retriever/retrievers/journal_retriever.py +105 -0
  16. logseq_retriever-0.4.1/logseq_retriever/retrievers/pgvector_journal_retriever.py +95 -0
  17. logseq_retriever-0.4.1/logseq_retriever/uploaders/__init__.py +0 -0
  18. logseq_retriever-0.4.1/logseq_retriever/uploaders/pgvector/__init__.py +9 -0
  19. logseq_retriever-0.4.1/logseq_retriever/uploaders/pgvector/journal_corpus_manager.py +72 -0
  20. logseq_retriever-0.4.1/logseq_retriever.egg-info/PKG-INFO +102 -0
  21. logseq_retriever-0.4.1/logseq_retriever.egg-info/SOURCES.txt +24 -0
  22. logseq_retriever-0.4.1/logseq_retriever.egg-info/dependency_links.txt +1 -0
  23. logseq_retriever-0.4.1/logseq_retriever.egg-info/requires.txt +23 -0
  24. logseq_retriever-0.4.1/logseq_retriever.egg-info/top_level.txt +1 -0
  25. logseq_retriever-0.4.1/pyproject.toml +57 -0
  26. logseq_retriever-0.4.1/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 David Ge Liu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: logseq-retriever
3
+ Version: 0.4.1
4
+ Summary: Python library for loading and retrieving Logseq documents
5
+ Author-email: DL <v49t9zpqd@mozmail.com>
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.11
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: langchain<0.4.0,>=0.3.24
14
+ Requires-Dist: pydantic<3.0,>=2.11
15
+ Requires-Dist: pgvector-template>=0.3.4
16
+ Provides-Extra: scripts
17
+ Requires-Dist: python-dotenv; extra == "scripts"
18
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "scripts"
19
+ Requires-Dist: langchain-aws>=0.2.0; extra == "scripts"
20
+ Provides-Extra: test
21
+ Requires-Dist: pytest; extra == "test"
22
+ Requires-Dist: pytest-cov; extra == "test"
23
+ Requires-Dist: python-dotenv; extra == "test"
24
+ Requires-Dist: boto3; extra == "test"
25
+ Requires-Dist: boto3-stubs; extra == "test"
26
+ Requires-Dist: langchain-aws>=0.2.0; extra == "test"
27
+ Requires-Dist: psycopg[binary]>=3.0.0; extra == "test"
28
+ Requires-Dist: ty; extra == "test"
29
+ Requires-Dist: ruff; extra == "test"
30
+ Provides-Extra: dist
31
+ Requires-Dist: build>=1.2.2; extra == "dist"
32
+ Requires-Dist: twine>=6.1.0; extra == "dist"
33
+ Dynamic: license-file
34
+
35
+ # Logseq Retriever
36
+ Python library for loading and retrieving Logseq documents.
37
+
38
+ ---
39
+ ## Components
40
+ This section provides an overview of the components provided, listed by type
41
+
42
+
43
+ ### Retrievers
44
+ Retrievers inject context into a conversation. They work in tandem with a Contextualizer and a `Document` Loader.
45
+ - **Input**:
46
+ - natural-language user-input, usually query-like
47
+ - (optional) chat history
48
+ - **Output**:
49
+ - list of `Document`s to provide context for an LLM to answer the user-input
50
+
51
+ #### Implementations
52
+ - `LogseqJournalDateRangeRetriever`
53
+ - retrieve Logseq journal `Document`s, intended for queries that require context from a date range
54
+ - required to set up:
55
+ - `RetrieverContextualizer`
56
+ - `LogseqJournalLoader`
57
+ - examples:
58
+ - "What did I do over Christmas break 2024?"
59
+ - "How did I spend the last Independence Day?"
60
+
61
+
62
+ ### Contextualizers
63
+ Contextualizers serve as the bridge between natural-language input and a downstream component that
64
+ handles fetching of relevant `Document`s.
65
+ - **Input**:
66
+ - natural-language user-input, usually query-like
67
+ - (optional) chat history
68
+ - **Output**:
69
+ - structured downstream query, based on the configured `output_schema` (or a plain string if no schema is provided)
70
+
71
+ In this library, an instance of `RetrieverContextualizer` is provided directly to
72
+ `Retriever`s during the latter's instantiation. To set up the `RetrieverContextualizer`, provide
73
+ `RetrieverContextualizerProps`, which includes:
74
+ - `llm` - this is the backbone of the contextualizer
75
+ - `prompt` - instructions provided to the LLM
76
+ - `output_schema` - (optional) structured schema used to fetch relevant `Document`s
77
+ - if no schema is provided, a string shall be returned instead
78
+ - other flags and settings
79
+
80
+
81
+ ### Loaders
82
+ Loaders are one type of component that can fetch relevant `Document`s. Loaders are typically specific to
83
+ a corresponding `Retriever` component.
84
+ - **Input**:
85
+ - each loader specifies its own schema
86
+ - the Contextualizer is usually responsible for creating an instance of the query obj to act upon
87
+ - **Output**:
88
+ - `list[Document]`
89
+
90
+ #### Implementations
91
+ - `LogseqJournalFilesystemLoader`
92
+ - loads from the filesystem, where journal files are expected to be present at specified path
93
+
94
+
95
+ ---
96
+ ## Scripts
97
+
98
+ ### PGVector
99
+
100
+ #### `upload_journal`
101
+
102
+ usage: `python scripts/upload_journal_to_pgvector.py [-h] [-p PATH] from_date to_date`
@@ -0,0 +1,68 @@
1
+ # Logseq Retriever
2
+ Python library for loading and retrieving Logseq documents.
3
+
4
+ ---
5
+ ## Components
6
+ This section provides an overview of the components provided, listed by type
7
+
8
+
9
+ ### Retrievers
10
+ Retrievers inject context into a conversation. They work in tandem with a Contextualizer and a `Document` Loader.
11
+ - **Input**:
12
+ - natural-language user-input, usually query-like
13
+ - (optional) chat history
14
+ - **Output**:
15
+ - list of `Document`s to provide context for an LLM to answer the user-input
16
+
17
+ #### Implementations
18
+ - `LogseqJournalDateRangeRetriever`
19
+ - retrieve Logseq journal `Document`s, intended for queries that require context from a date range
20
+ - required to set up:
21
+ - `RetrieverContextualizer`
22
+ - `LogseqJournalLoader`
23
+ - examples:
24
+ - "What did I do over Christmas break 2024?"
25
+ - "How did I spend the last Independence Day?"
26
+
27
+
28
+ ### Contextualizers
29
+ Contextualizers serve as the bridge between natural-language input and a downstream component that
30
+ handles fetching of relevant `Document`s.
31
+ - **Input**:
32
+ - natural-language user-input, usually query-like
33
+ - (optional) chat history
34
+ - **Output**:
35
+ - structured downstream query, based on the configured `output_schema` (or a plain string if no schema is provided)
36
+
37
+ In this library, an instance of `RetrieverContextualizer` is provided directly to
38
+ `Retriever`s during the latter's instantiation. To set up the `RetrieverContextualizer`, provide
39
+ `RetrieverContextualizerProps`, which includes:
40
+ - `llm` - this is the backbone of the contextualizer
41
+ - `prompt` - instructions provided to the LLM
42
+ - `output_schema` - (optional) structured schema used to fetch relevant `Document`s
43
+ - if no schema is provided, a string shall be returned instead
44
+ - other flags and settings
45
+
46
+
47
+ ### Loaders
48
+ Loaders are one type of component that can fetch relevant `Document`s. Loaders are typically specific to
49
+ a corresponding `Retriever` component.
50
+ - **Input**:
51
+ - each loader specifies its own schema
52
+ - the Contextualizer is usually responsible for creating an instance of the query obj to act upon
53
+ - **Output**:
54
+ - `list[Document]`
55
+
56
+ #### Implementations
57
+ - `LogseqJournalFilesystemLoader`
58
+ - loads from the filesystem, where journal files are expected to be present at specified path
59
+
60
+
61
+ ---
62
+ ## Scripts
63
+
64
+ ### PGVector
65
+
66
+ #### `upload_journal`
67
+
68
+ usage: `python scripts/upload_journal_to_pgvector.py [-h] [-p PATH] from_date to_date`
File without changes
@@ -0,0 +1,15 @@
1
+ from logseq_retriever.loaders.journal_document_metadata import (
2
+ LogseqJournalDocumentMetadata,
3
+ )
4
+ from logseq_retriever.loaders.journal_filesystem_loader import (
5
+ LogseqJournalFilesystemLoader,
6
+ )
7
+ from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
8
+ from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
9
+
10
+ __all__ = [
11
+ "LogseqJournalDocumentMetadata",
12
+ "LogseqJournalFilesystemLoader",
13
+ "LogseqJournalLoaderInput",
14
+ "LogseqJournalLoader",
15
+ ]
@@ -0,0 +1,32 @@
1
+ from typing import Annotated
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class LogseqJournalDocumentMetadata(BaseModel):
    """
    Metadata for a Logseq journal `Document`.
    """

    # Date of the journal entry, kept as a string.
    journal_date: str = Field(
        description="The date of the journal entry, in YYYY-MM-DD format.",
        examples=["2023-01-01", "2025-06-09"],
    )

    # Tags associated with the entry.
    journal_tags: list[str] = Field(
        description="The tags associated with the journal entry.",
        examples=[["tag1", "tag2"], ["tag3"]],
    )

    # Size of the entry's content, in characters.
    journal_char_count: int = Field(
        description="The number of characters in the journal entry.",
    )
@@ -0,0 +1,157 @@
1
+ from datetime import date, datetime
2
+ from logging import getLogger
3
+ from pathlib import Path
4
+
5
+ from langchain_core.documents import Document
6
+ from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
7
+ from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
8
+ from logseq_retriever.loaders.journal_document_metadata import (
9
+ LogseqJournalDocumentMetadata,
10
+ )
11
+ import os
12
+
13
+
14
+ logger = getLogger(__name__)
15
+
16
+
17
class LogseqJournalFilesystemLoader(LogseqJournalLoader):
    """
    Based on input, load a collection of Logseq journal files from the filesystem, as
    Langchain `Document`s.
    """

    def __init__(
        self,
        logseq_journal_path: str,
        **kwargs,
    ):
        """
        Initialize the loader with the path to the Logseq journal directory.

        Args:
            logseq_journal_path: Directory that should contain Logseq journal files,
                such as `2025_03_27.md`.
            **kwargs: Accepted but not used by this loader.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        self.logseq_journal_path = logseq_journal_path
        self._validate_logseq_journal_path()

    def load(  # type: ignore[override]
        self,
        input: LogseqJournalLoaderInput,
    ) -> list[Document]:
        """
        Synchronously load the documents from the Logseq journal directory, according to the input.

        Args:
            input: Date range (inclusive on both ends) and splitting options.

        Returns:
            The `Document`s parsed from every journal file whose filename date falls
            within the range, in sorted filename order.

        Raises:
            ValueError: If `input.start_date` is later than `input.end_date`.
        """
        if input.start_date > input.end_date:
            raise ValueError("journal_end_date must be after journal_start_date")

        documents: list[Document] = []
        # TODO this glob pattern can be improved by analyzing start_date & end_date to provide fewer matches
        # Sorting makes the result order deterministic (chronological for
        # zero-padded `YYYY_MM_DD.md` names) instead of filesystem-dependent.
        for path in sorted(Path(self.logseq_journal_path).glob("*.md")):
            filename = path.name
            if not self._match_journal(filename, input.start_date, input.end_date):
                continue
            # Read with an explicit encoding so decoding does not depend on the
            # platform's locale default.
            content = path.read_text(encoding="utf-8")
            documents.extend(
                self.__class__.parse_journal_markdown_file(
                    content, filename, input.enable_splitting
                )
            )
        return documents

    def _validate_logseq_journal_path(self):
        """
        Validate the path to the Logseq journal directory. Check that the directory exists.
        If the directory is empty, or does not contain any `.md` files, log a warning.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        # verify that the path exists, and is a directory
        if not os.path.exists(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path does not exist: {self.logseq_journal_path}"
            )
        if not os.path.isdir(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path is not a directory: {self.logseq_journal_path}"
            )

        # These conditions are only warned about; the loader remains usable.
        if not os.listdir(self.logseq_journal_path):
            logger.warning(
                f"Logseq journal directory is empty: {self.logseq_journal_path}"
            )
        # `any` stops at the first match instead of materialising the whole glob.
        if not any(Path(self.logseq_journal_path).glob("*.md")):
            logger.warning(
                f"No files with .md extension found in {self.logseq_journal_path}"
            )

    def _match_journal(self, filename: str, start_date: date, end_date: date) -> bool:
        """
        Return `True` if journal date is between `start_date` & `end_date` (inclusive).

        Args:
            filename: The journal filename (e.g., "2025_03_27.md")
            start_date: The start date as a `date` object
            end_date: The end date as a `date` object

        Returns:
            bool: True if the file's date is within the range, False otherwise
                (including when the filename is not a parseable journal name).
        """
        if not filename.endswith(".md"):
            return False

        try:
            # Convert filename to date object
            file_date = datetime.strptime(
                filename.removesuffix(".md"), "%Y_%m_%d"
            ).date()
        except ValueError:
            # If there's any issue parsing the date from filename, skip this file
            return False
        return start_date <= file_date <= end_date

    @staticmethod
    def parse_journal_markdown_file(
        content: str, filename: str, enable_splitting: bool = True
    ) -> list[Document]:
        """
        Generate `Document`s from a file's contents. If `enable_splitting` is set, split the
        content on top-level bullets into separate `Document`s, and attach metadata to each.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            content: Full text of the journal file.
            filename: Name of the journal file; used to derive the metadata date.
            enable_splitting: Split on "\\n- " when True, otherwise keep the file whole.

        Returns:
            One `Document` per non-blank section.
        """
        # NOTE: splitting on "\n- " removes the bullet marker from every section
        # except the first, which keeps any leading "- " it had.
        sections = content.split("\n- ") if enable_splitting else [content]
        docs: list[Document] = []
        for section in sections:
            section_content = section.strip()
            if not section_content:
                # Skip sections that are empty or whitespace-only.
                continue
            # TODO: enforce a maximum character count per Document (not implemented yet)
            metadata = (
                LogseqJournalFilesystemLoader.parse_journal_markdown_file_metadata(
                    section_content, filename
                )
            )
            docs.append(
                Document(page_content=section_content, metadata=metadata.model_dump())
            )
        return docs

    @staticmethod
    def parse_journal_markdown_file_metadata(
        section: str, filename: str
    ) -> LogseqJournalDocumentMetadata:
        """
        Parse metadata from a journal markdown file. Return `LogseqJournalDocumentMetadata`.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            section: The text of the section the metadata describes.
            filename: Journal filename, e.g. "2025_03_27.md".

        Returns:
            Metadata with the date derived from the filename, an empty tag list,
            and the section's character count.
        """
        # Extract date from filename; only the trailing extension is removed, so
        # a ".md" occurring elsewhere in the name is left untouched.
        date_str = filename.removesuffix(".md").replace("_", "-")

        return LogseqJournalDocumentMetadata(
            journal_date=date_str,
            # TODO get tags from Document's contents
            journal_tags=[],
            journal_char_count=len(section),
        )
@@ -0,0 +1,13 @@
1
+ from typing import Any
2
+
3
+ from langchain_core.document_loaders import BaseLoader
4
+ from langchain_core.documents import Document
5
+
6
+
7
class LogseqJournalLoader(BaseLoader):
    """
    Common parent class for loaders of Logseq journal files.
    """

    def load(self, input: Any) -> list[Document]:  # type: ignore[override]  # ty: ignore[invalid-method-override]
        # Subclasses supply the actual loading logic for their own input type.
        message = "This method should be implemented by subclasses."
        raise NotImplementedError(message)
@@ -0,0 +1,108 @@
1
+ from datetime import datetime, date
2
+ from typing import Annotated
3
+
4
+ from pydantic import (
5
+ BaseModel,
6
+ Field,
7
+ AfterValidator,
8
+ computed_field,
9
+ PrivateAttr,
10
+ model_validator,
11
+ )
12
+
13
+
14
def _validate_date_format(value: str) -> str:
    """
    Validate a date string and normalize it to ISO format (YYYY-MM-DD).
    Accepts dates like '2023-3-1' and converts to '2023-03-01'.

    Args:
        value: The date string to validate.

    Returns:
        The date as a zero-padded ISO-8601 string.

    Raises:
        ValueError: If `value` cannot be parsed as a YYYY-MM-DD date.
    """
    try:
        parsed = _parse_date(value)
    except ValueError as err:
        raise ValueError(
            f"Invalid date: '{value}'. Expecting ISO-8601 format: YYYY-MM-DD"
        ) from err
    # `strptime` tolerates unpadded month/day values, so re-serialise the parsed
    # date to return the normalized form rather than the raw input.
    return parsed.isoformat()


def _parse_date(date_str: str) -> date:
    """
    Parse a date string into a datetime.date object.

    Raises:
        ValueError: If `date_str` does not match the `%Y-%m-%d` format.
    """
    return datetime.strptime(date_str, "%Y-%m-%d").date()
33
+
34
+
35
class LogseqJournalLoaderInput(BaseModel):
    """
    Input for a Logseq journal `Document` loader, to invoke a load.
    """

    # Both date fields are strings checked by `_validate_date_format`.
    journal_start_date: Annotated[
        str,
        Field(
            description="The start date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]
    journal_end_date: Annotated[
        str,
        Field(
            description="The end date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]

    # Optional settings; each default is declared once, on the `Field`.
    max_char_length: int = Field(
        default=1024 * 8,
        description="The maximum number of characters to include in a single `Document`.",
        examples=[8196, 2000],
    )
    enable_splitting: bool = Field(
        default=True,
        description="Whether to split the journal file into multiple `Document`s.",
        examples=[True, False],
    )

    # Parsed counterparts of the date strings; private, so not part of model_dump
    _start_date: date = PrivateAttr()
    _end_date: date = PrivateAttr()

    @model_validator(mode="after")
    def _parse_dates(self) -> "LogseqJournalLoaderInput":
        """Parse date strings into date objects after validation."""
        parsed_start = _parse_date(self.journal_start_date)
        parsed_end = _parse_date(self.journal_end_date)
        self._start_date = parsed_start
        self._end_date = parsed_end
        return self

    @computed_field
    @property
    def start_date(self) -> date:
        """Get `journal_start_date` as a date object."""
        return self._start_date

    @computed_field
    @property
    def end_date(self) -> date:
        """Get `journal_end_date` as a date object."""
        return self._end_date
95
+
96
+
97
+ # debugging only
98
if __name__ == "__main__":
    from pprint import pprint

    # Show the JSON schema generated for the input model.
    schema = LogseqJournalLoaderInput.model_json_schema()
    pprint(schema)

    # Build one sample input and show its serialized form.
    sample_input = LogseqJournalLoaderInput(
        journal_start_date="2023-01-01",
        journal_end_date="2023-01-02",
        max_char_length=1024 * 4,
    )
    print(sample_input.model_dump())
@@ -0,0 +1,15 @@
1
+ from logseq_retriever.models.journal_pgvector import (
2
+ JournalDocument,
3
+ JournalCorpusMetadata,
4
+ JournalDocumentMetadata,
5
+ JournalSearchClientConfig,
6
+ JournalSearchQuery,
7
+ )
8
+
9
+ __all__ = [
10
+ "JournalDocument",
11
+ "JournalCorpusMetadata",
12
+ "JournalDocumentMetadata",
13
+ "JournalSearchClientConfig",
14
+ "JournalSearchQuery",
15
+ ]
@@ -0,0 +1,107 @@
1
+ from typing import Type
2
+
3
+ from pgvector.sqlalchemy import Vector
4
+ from pydantic import Field
5
+ from sqlalchemy import Column, String
6
+
7
+ from pgvector_template.core import (
8
+ BaseDocument,
9
+ BaseDocumentMetadata,
10
+ BaseSearchClientConfig,
11
+ )
12
+ from pgvector_template.models.search import (
13
+ SearchQuery,
14
+ MetadataFilter,
15
+ )
16
+
17
+
18
class JournalDocument(BaseDocument):
    """
    Table model for Logseq journal `Document` chunks.

    A `Corpus` is the entire entry for a given date and may consist of 1 or more
    `Document` chunks. Each `Corpus` has a set of metadata; every `Document` chunk
    carries all of it, plus more of its own.
    """

    __abstract__ = False
    __tablename__ = "logseq_journal"

    # Indexed string column, sized to the length of an ISO date string.
    corpus_id = Column(String(len("2025-06-09")), index=True)

    # Embedding vector, 1024 dimensions.
    embedding = Column(Vector(1024))
31
+
32
+
33
class JournalCorpusMetadata(BaseDocumentMetadata):
    """Metadata schema for Logseq journal corpora. Consist of 1-or-more chunks, called `Document`s."""

    # Corpus-level field: the date, constrained by `pattern` to NNNN-NN-NN.
    date_str: str = Field(
        description="Date in ISO format, e.g. `2025-04-20`",
        pattern=r"^\d{4}-\d{2}-\d{2}$",
    )

    # Fixed defaults for this document type.
    document_type: str = "logseq_journal"
    schema_version: str = "2025-07-10"
45
+
46
+
47
class JournalDocumentMetadata(JournalCorpusMetadata):
    """Metadata schema for Logseq journal `Document`s. 1-or-more `Document`s make up a corpus."""

    # chunk/document
    # These fields extend the corpus-level fields inherited from JournalCorpusMetadata.
    chunk_len: int = Field()
    """Length of the content in characters"""
    # NOTE(review): `Field()` supplies no default, so this field appears to be
    # required even though it accepts None — confirm that is intended.
    word_count: int | None = Field()
    """Length of the content in words"""
    # Both list fields default to an empty list.
    references: list[str] = Field(default=[])
    """List of references to other Logseq documents, or journal dates"""
    anchor_ids: list[str] = Field(default=[])
    """Blocks in the document can have UUID anchors, which are referenced elsewhere. This is a list of all present"""
59
+
60
+
61
class JournalSearchClientConfig(BaseSearchClientConfig):
    """Configuration for the Logseq journal search client."""

    # Both attributes default to the journal-specific classes defined in this module.
    document_cls: Type[BaseDocument] = JournalDocument
    """The document type to use for the search client."""
    document_metadata_cls: Type[BaseDocumentMetadata] = JournalDocumentMetadata
    """The document metadata type to use for the search client."""
    # embedding_provider
    # NOTE(review): the line above looks like a placeholder for a setting that is
    # not declared here — confirm against BaseSearchClientConfig.
69
+
70
+
71
class JournalSearchQuery(SearchQuery):
    """
    Standardized search query structure, specifically for searching Logseq `JournalDocument`s.
    At least 1 search criterion is required (text, keywords, metadata_filters), but multiple are allowed.
    Types are the same as in `SearchQuery`.
    Descriptions are customized to better suit Logseq `JournalDocument`'s.
    """

    # NOTE(review): the attribute docstrings below read as instructions for
    # whoever builds the query; they may be surfaced at runtime by the base
    # class configuration — confirm before rewording them.

    # Optional; defaults to None.
    text: str | None = None
    """
    String to match against using in a semantic search, i.e. using vector distance.
    Instead of passing in a question, rephrase the question to be a string/phrase matching closer
    to the content expected to be found.
    """

    # Defaults to an empty list.
    keywords: list[str] = []
    """
    List of keywords to **exact-match**.
    If any keywords are provided, at least 1 keyword must appear in the content,
    so use only if certain that the word will appear.
    Do not include keywords that can be covered in metadata_filters, e.g. dates, document type.
    If you are not certain that a word will appear, try using `text` for a semantic search instead.
    """

    # The JSON schema of JournalDocumentMetadata is attached to this field's
    # schema extras under the `metadata_schema` key.
    metadata_filters: list[MetadataFilter] = Field(
        default=[],
        json_schema_extra={
            "metadata_schema": JournalDocumentMetadata.model_json_schema()
        },
    )
    """
    List of metadata conditions that must be matched.
    Refer to `metadata_schema` for the expected schema, as it exists in the database.
    """

    # Defaults to 20; `ge=3` sets a lower bound of 3.
    limit: int = Field(20, ge=3)
    """Maximum number of results to return."""
@@ -0,0 +1,19 @@
1
+ from logseq_retriever.retrievers.contextualizer import (
2
+ RetrieverContextualizerProps,
3
+ RetrieverContextualizer,
4
+ )
5
+ from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
6
+ from logseq_retriever.retrievers.journal_date_range_retriever import (
7
+ LogseqJournalDateRangeRetriever,
8
+ )
9
+ from logseq_retriever.retrievers.pgvector_journal_retriever import (
10
+ PGVectorJournalRetriever,
11
+ )
12
+
13
+ __all__ = [
14
+ "RetrieverContextualizerProps",
15
+ "RetrieverContextualizer",
16
+ "LogseqJournalRetriever",
17
+ "LogseqJournalDateRangeRetriever",
18
+ "PGVectorJournalRetriever",
19
+ ]