logseq-retriever 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,15 @@
1
+ from logseq_retriever.loaders.journal_document_metadata import (
2
+ LogseqJournalDocumentMetadata,
3
+ )
4
+ from logseq_retriever.loaders.journal_filesystem_loader import (
5
+ LogseqJournalFilesystemLoader,
6
+ )
7
+ from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
8
+ from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
9
+
10
+ __all__ = [
11
+ "LogseqJournalDocumentMetadata",
12
+ "LogseqJournalFilesystemLoader",
13
+ "LogseqJournalLoaderInput",
14
+ "LogseqJournalLoader",
15
+ ]
@@ -0,0 +1,32 @@
1
+ from typing import Annotated
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class LogseqJournalDocumentMetadata(BaseModel):
    """
    Metadata for a Logseq journal `Document`.
    """

    # Date of the journal entry the document was cut from, kept as a plain string.
    journal_date: str = Field(
        description="The date of the journal entry, in YYYY-MM-DD format.",
        examples=["2023-01-01", "2025-06-09"],
    )

    # Tags attached to the entry; required, so callers pass `[]` when there are none.
    journal_tags: list[str] = Field(
        description="The tags associated with the journal entry.",
        examples=[["tag1", "tag2"], ["tag3"]],
    )

    # Size of the document text, measured in characters.
    journal_char_count: int = Field(
        description="The number of characters in the journal entry.",
    )
@@ -0,0 +1,157 @@
1
+ from datetime import date, datetime
2
+ from logging import getLogger
3
+ from pathlib import Path
4
+
5
+ from langchain_core.documents import Document
6
+ from logseq_retriever.loaders.journal_loader import LogseqJournalLoader
7
+ from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
8
+ from logseq_retriever.loaders.journal_document_metadata import (
9
+ LogseqJournalDocumentMetadata,
10
+ )
11
+ import os
12
+
13
+
14
+ logger = getLogger(__name__)
15
+
16
+
17
class LogseqJournalFilesystemLoader(LogseqJournalLoader):
    """
    Based on input, load a collection of Logseq journal files from the filesystem, as
    Langchain `Document`s.
    """

    def __init__(
        self,
        logseq_journal_path: str,
        **kwargs,
    ):
        """
        Initialize the loader with the path to the Logseq journal directory.

        Args:
            logseq_journal_path: Directory that should contain Logseq journal files,
                such as `2025_03_27.md`.
            **kwargs: Accepted for call-site compatibility; not used by this loader.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        self.logseq_journal_path = logseq_journal_path
        self._validate_logseq_journal_path()

    def load(  # type: ignore[override]
        self,
        input: LogseqJournalLoaderInput,
    ) -> list[Document]:
        """
        Synchronously load the documents from the Logseq journal directory, according to the input.

        Args:
            input: Date range (inclusive on both ends) and splitting options.

        Returns:
            The `Document`s of every matching journal file, ordered by filename
            (which, for `YYYY_MM_DD.md` names, is chronological order).

        Raises:
            ValueError: If `input.start_date` is later than `input.end_date`.
        """
        if input.start_date > input.end_date:
            raise ValueError("journal_end_date must be after journal_start_date")

        documents: list[Document] = []
        # TODO this glob pattern can be improved by analyzing start_date & end_date to provide fewer matches
        # `glob` yields entries in arbitrary filesystem order; sorting makes the
        # returned list deterministic.
        for path in sorted(Path(self.logseq_journal_path).glob("*.md")):
            filename = path.name
            if not self._match_journal(filename, input.start_date, input.end_date):
                continue
            # Decode explicitly as UTF-8 instead of the platform's locale default,
            # which can mis-decode non-ASCII notes on some systems.
            content = path.read_text(encoding="utf-8")
            documents.extend(
                self.__class__.parse_journal_markdown_file(
                    content, filename, input.enable_splitting
                )
            )
        return documents

    def _validate_logseq_journal_path(self):
        """
        Validate the path to the Logseq journal directory. Check that the directory exists.
        If the directory is empty, or does not contain files with the expected format, log a warning.

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        # verify that the path exists, and is a directory
        if not os.path.exists(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path does not exist: {self.logseq_journal_path}"
            )
        if not os.path.isdir(self.logseq_journal_path):
            raise ValueError(
                f"Logseq journal path is not a directory: {self.logseq_journal_path}"
            )

        # An unusable-but-valid directory is only worth a warning: files may be added later.
        if not os.listdir(self.logseq_journal_path):
            logger.warning(
                f"Logseq journal directory is empty: {self.logseq_journal_path}"
            )
        # `any` stops at the first match instead of materializing the whole listing.
        if not any(Path(self.logseq_journal_path).glob("*.md")):
            logger.warning(
                f"No files with .md extension found in {self.logseq_journal_path}"
            )

    def _match_journal(self, filename: str, start_date: date, end_date: date) -> bool:
        """
        Return `True` if journal date is between `start_date` & `end_date` (both inclusive).

        Args:
            filename: The journal filename (e.g., "2025_03_27.md")
            start_date: The first date of the range
            end_date: The last date of the range

        Returns:
            bool: True if the file's date is within the range, False otherwise
            (including when the filename is not a `YYYY_MM_DD.md` journal name).
        """
        if not filename.endswith(".md"):
            return False

        try:
            file_date = datetime.strptime(
                filename.removesuffix(".md"), "%Y_%m_%d"
            ).date()
        except ValueError:
            # Not a journal filename (e.g. a regular page); skip this file
            return False
        return start_date <= file_date <= end_date

    @staticmethod
    def parse_journal_markdown_file(
        content: str, filename: str, enable_splitting: bool = True
    ) -> list[Document]:
        """
        Generate `Document`s from a file's contents. If requested, split content into one
        `Document` per root-level bullet, and attach metadata to each.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            content: Full text of the journal file.
            filename: Name of the journal file, e.g. `2025_03_27.md`.
            enable_splitting: When True, split on root-level bullets ("\\n- ");
                when False, the whole file becomes a single `Document`.

        Returns:
            One `Document` per non-empty section.
        """
        if enable_splitting:
            sections = content.split("\n- ")
            # The first root-level bullet has no preceding newline, so `split`
            # leaves its "- " marker attached; drop it so every section is
            # marker-free like the others.
            sections[0] = sections[0].removeprefix("- ")
        else:
            sections = [content]

        docs = []
        for section in sections:
            if section_content := section.strip():
                # TODO: enforce a maximum character count per Document (split
                # further when a section is longer than acceptable)
                metadata = (
                    LogseqJournalFilesystemLoader.parse_journal_markdown_file_metadata(
                        section_content, filename
                    )
                )
                docs.append(
                    Document(
                        page_content=section_content, metadata=metadata.model_dump()
                    )
                )
        return docs

    @staticmethod
    def parse_journal_markdown_file_metadata(
        section: str, filename: str
    ) -> LogseqJournalDocumentMetadata:
        """
        Parse metadata for one section of a journal markdown file. Return `LogseqJournalDocumentMetadata`.
        This function can potentially be augmented by calling Logseq APIs, rather than simply parsing markdown files.

        Args:
            section: Text of the section the metadata describes.
            filename: Name of the journal file, e.g. `2025_03_27.md`.
        """
        # `2025_03_27.md` -> `2025-03-27`; only the trailing extension is removed
        date_str = filename.removesuffix(".md").replace("_", "-")
        char_count = len(section)

        return LogseqJournalDocumentMetadata(
            journal_date=date_str,
            # TODO get tags from Document's contents
            journal_tags=[],
            journal_char_count=char_count,
        )
@@ -0,0 +1,13 @@
1
+ from typing import Any
2
+
3
+ from langchain_core.document_loaders import BaseLoader
4
+ from langchain_core.documents import Document
5
+
6
+
7
class LogseqJournalLoader(BaseLoader):
    """
    Common parent of the Logseq journal loaders.

    It only fixes the shape of `load`; concrete loaders supply the behavior.
    """

    def load(self, input: Any) -> list[Document]:  # type: ignore[override] # ty: ignore[invalid-method-override]
        """Load `Document`s for `input`; always raises here, subclasses override."""
        raise NotImplementedError("This method should be implemented by subclasses.")
@@ -0,0 +1,108 @@
1
+ from datetime import datetime, date
2
+ from typing import Annotated
3
+
4
+ from pydantic import (
5
+ BaseModel,
6
+ Field,
7
+ AfterValidator,
8
+ computed_field,
9
+ PrivateAttr,
10
+ model_validator,
11
+ )
12
+
13
+
14
def _validate_date_format(value: str) -> str:
    """
    Validate a date string and normalize it to ISO format (YYYY-MM-DD).

    `strptime` tolerates components that are not zero-padded, so a value like
    '2023-3-1' is accepted; it is returned as '2023-03-01'.

    Args:
        value: The date string to check.

    Returns:
        The same date, formatted as zero-padded YYYY-MM-DD.

    Raises:
        ValueError: If `value` cannot be parsed as a YYYY-MM-DD date.
    """
    try:
        # Re-serialize the parsed date so the stored string is always canonical.
        return _parse_date(value).isoformat()
    except ValueError as err:
        raise ValueError(
            f"Invalid date: '{value}'. Expecting ISO-8601 format: YYYY-MM-DD"
        ) from err


def _parse_date(date_str: str) -> date:
    """
    Parse a YYYY-MM-DD date string into a `datetime.date` object.

    Raises:
        ValueError: If `date_str` does not match the format or is not a real date.
    """
    return datetime.strptime(date_str, "%Y-%m-%d").date()
33
+
34
+
35
class LogseqJournalLoaderInput(BaseModel):
    """
    Input for a Logseq journal `Document` loader, to invoke a load.
    """

    # Both date bounds are kept as strings and checked by `_validate_date_format`.
    journal_start_date: Annotated[
        str,
        Field(
            description="The start date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]
    journal_end_date: Annotated[
        str,
        Field(
            description="The end date of the journal to load, in YYYY-MM-DD format.",
            examples=["2023-01-01", "2025-06-09"],
        ),
        AfterValidator(_validate_date_format),
    ]

    # NOTE(review): the example value 8196 is probably meant to be 8192 (1024 * 8) — confirm.
    max_char_length: int = Field(
        default=1024 * 8,
        description="The maximum number of characters to include in a single `Document`.",
        examples=[8196, 2000],
    )
    enable_splitting: bool = Field(
        default=True,
        description="Whether to split the journal file into multiple `Document`s.",
        examples=[True, False],
    )

    # Parsed counterparts of the two date strings; private attributes are not fields.
    _start_date: date = PrivateAttr()
    _end_date: date = PrivateAttr()

    @model_validator(mode="after")
    def _parse_dates(self) -> "LogseqJournalLoaderInput":
        """Cache both bounds as `date` objects once the string fields are validated."""
        self._start_date, self._end_date = (
            _parse_date(self.journal_start_date),
            _parse_date(self.journal_end_date),
        )
        return self

    @computed_field
    @property
    def start_date(self) -> date:
        """Get `journal_start_date` as a date object."""
        return self._start_date

    @computed_field
    @property
    def end_date(self) -> date:
        """Get `journal_end_date` as a date object."""
        return self._end_date
95
+
96
+
97
+ # debugging only
98
if __name__ == "__main__":
    from pprint import pprint

    # Dump the JSON schema of the input model.
    pprint(LogseqJournalLoaderInput.model_json_schema())

    # Build one instance with a non-default length limit and print its dump.
    sample_input = LogseqJournalLoaderInput(
        journal_start_date="2023-01-01",
        journal_end_date="2023-01-02",
        max_char_length=1024 * 4,
    )
    print(sample_input.model_dump())
@@ -0,0 +1,15 @@
1
+ from logseq_retriever.models.journal_pgvector import (
2
+ JournalDocument,
3
+ JournalCorpusMetadata,
4
+ JournalDocumentMetadata,
5
+ JournalSearchClientConfig,
6
+ JournalSearchQuery,
7
+ )
8
+
9
+ __all__ = [
10
+ "JournalDocument",
11
+ "JournalCorpusMetadata",
12
+ "JournalDocumentMetadata",
13
+ "JournalSearchClientConfig",
14
+ "JournalSearchQuery",
15
+ ]
@@ -0,0 +1,107 @@
1
+ from typing import Type
2
+
3
+ from pgvector.sqlalchemy import Vector
4
+ from pydantic import Field
5
+ from sqlalchemy import Column, String
6
+
7
+ from pgvector_template.core import (
8
+ BaseDocument,
9
+ BaseDocumentMetadata,
10
+ BaseSearchClientConfig,
11
+ )
12
+ from pgvector_template.models.search import (
13
+ SearchQuery,
14
+ MetadataFilter,
15
+ )
16
+
17
+
18
class JournalDocument(BaseDocument):
    """
    Each `Corpus` is the entire entry for a given date. A corpus may consist of 1 or more chunks of `Document`s.
    Each `Corpus` has a set of metadata, and each `Document` chunk has all of those, plus more.
    """

    __abstract__ = False
    __tablename__ = "logseq_journal"

    # Indexed string column sized to the length of an ISO date string (10 characters).
    corpus_id = Column(String(len("2025-06-09")), index=True)

    # Embedding vector column, declared with 1024 dimensions.
    embedding = Column(Vector(1024))
31
+
32
+
33
class JournalCorpusMetadata(BaseDocumentMetadata):
    """Metadata schema for Logseq journal corpora. Consist of 1-or-more chunks, called `Document`s."""

    # corpus
    # The pattern only enforces the `dddd-dd-dd` digit shape; it does not check
    # that the value is a real calendar date.
    date_str: str = Field(
        pattern=r"^\d{4}-\d{2}-\d{2}$",
        description="Date in ISO format, e.g. `2025-04-20`",
    )

    # defaults
    # Both fields are plain strings with fixed default values; callers may override them.
    document_type: str = Field(default="logseq_journal")
    schema_version: str = Field(default="2025-07-10")
45
+
46
+
47
class JournalDocumentMetadata(JournalCorpusMetadata):
    """Metadata schema for Logseq journal `Document`s. 1-or-more `Document`s make up a corpus."""

    # chunk/document
    # `Field()` supplies no default, so `chunk_len` must always be provided.
    chunk_len: int = Field()
    """Length of the content in characters"""
    # `None` is an accepted value, but with no default the key itself is still required.
    word_count: int | None = Field()
    """Length of the content in words"""
    # The two list fields default to an empty list.
    references: list[str] = Field(default=[])
    """List of references to other Logseq documents, or journal dates"""
    anchor_ids: list[str] = Field(default=[])
    """Blocks in the document can have UUID anchors, which are referenced elsewhere. This is a list of all present"""
59
+
60
+
61
class JournalSearchClientConfig(BaseSearchClientConfig):
    """Configuration for the Logseq journal search client."""

    # Defaults the document and metadata classes to the Logseq journal models
    # defined in this module.
    document_cls: Type[BaseDocument] = JournalDocument
    """The document type to use for the search client."""
    document_metadata_cls: Type[BaseDocumentMetadata] = JournalDocumentMetadata
    """The document metadata type to use for the search client."""
    # embedding_provider
68
+ # embedding_provider
69
+
70
+
71
class JournalSearchQuery(SearchQuery):
    """
    Standardized search query structure, specifically for searching Logseq `JournalDocument`s.
    At least 1 search criterion is required (text, keywords, metadata_filters), but multiple are allowed.
    Types are the same as in `SearchQuery`.
    Descriptions are customized to better suit Logseq `JournalDocument`'s.
    """

    # NOTE(review): the string literals after each field presumably serve as the
    # field descriptions mentioned in the class docstring — confirm how the
    # base class consumes them before rewording any of them.
    text: str | None = None
    """
    String to match against using in a semantic search, i.e. using vector distance.
    Instead of passing in a question, rephrase the question to be a string/phrase matching closer
    to the content expected to be found.
    """

    keywords: list[str] = []
    """
    List of keywords to **exact-match**.
    If any keywords are provided, at least 1 keyword must appear in the content,
    so use only if certain that the word will appear.
    Do not include keywords that can be covered in metadata_filters, e.g. dates, document type.
    If you are not certain that a word will appear, try using `text` for a semantic search instead.
    """

    # The JSON schema of `JournalDocumentMetadata` is computed once, when this
    # class body is executed, and attached to the field's schema under the
    # `metadata_schema` key.
    metadata_filters: list[MetadataFilter] = Field(
        default=[],
        json_schema_extra={
            "metadata_schema": JournalDocumentMetadata.model_json_schema()
        },
    )
    """
    List of metadata conditions that must be matched.
    Refer to `metadata_schema` for the expected schema, as it exists in the database.
    """

    # Defaults to 20; values below 3 are rejected by the `ge=3` constraint.
    limit: int = Field(20, ge=3)
    """Maximum number of results to return."""
@@ -0,0 +1,19 @@
1
+ from logseq_retriever.retrievers.contextualizer import (
2
+ RetrieverContextualizerProps,
3
+ RetrieverContextualizer,
4
+ )
5
+ from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
6
+ from logseq_retriever.retrievers.journal_date_range_retriever import (
7
+ LogseqJournalDateRangeRetriever,
8
+ )
9
+ from logseq_retriever.retrievers.pgvector_journal_retriever import (
10
+ PGVectorJournalRetriever,
11
+ )
12
+
13
+ __all__ = [
14
+ "RetrieverContextualizerProps",
15
+ "RetrieverContextualizer",
16
+ "LogseqJournalRetriever",
17
+ "LogseqJournalDateRangeRetriever",
18
+ "PGVectorJournalRetriever",
19
+ ]
@@ -0,0 +1,172 @@
1
+ from datetime import datetime
2
+ from logging import getLogger
3
+ from textwrap import dedent
4
+ from typing import Annotated, Any, Optional, Type
5
+
6
+ from langchain_core.language_models import BaseLanguageModel
7
+ from langchain_core.runnables import Runnable
8
+ from langchain_core.prompts import PromptTemplate
9
+ from langchain_core.output_parsers import PydanticOutputParser
10
+ from pydantic import BaseModel, Field
11
+
12
+
13
+ logger = getLogger(__name__)
14
+
15
+
16
# Prompt layout used when chat history is disabled. Placeholders: `prompt`,
# `user_input`, `format_instructions`. The backslash after the opening quotes
# keeps the template from starting with a blank line; `dedent` removes the
# source-code indentation.
RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE = dedent(
    """\
    {prompt}

    Latest user input: {user_input}

    {format_instructions}
    """
)
# Prompt layout used when chat history is enabled. Adds the `realtime_context`
# and `chat_history` placeholders to the ones above.
RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE_WITH_CHAT_HISTORY = dedent(
    """\
    Realtime context:
    {realtime_context}

    {prompt}

    Chat History:
    {chat_history}

    Latest user input: {user_input}

    {format_instructions}
    """
)
40
+
41
+
42
class RetrieverContextualizerProps(BaseModel):
    """
    Contextualizers are a component within Langchain `Retriever`s, that transform a natural-language
    input (and history) into an actionable query, which can in turn be used to fetch relevant
    `Document`s to address the input. The actionable query output can be structured, or
    simply another string that can be used to query a Vectorstore.

    To do this, the core of the Contextualizer is an LLM. The `prompt` is used by the LLM to perform
    the transformation task.
    """

    # Required. The text must be passed as `description=`: the first positional
    # argument of `Field` is the *default value*, which would silently make
    # `llm` optional with a string as its default.
    llm: Annotated[
        BaseLanguageModel,
        Field(
            description="The LLM that will be used to transform the input into an actionable query.",
        ),
    ]

    # Task instructions for the LLM; the default is a generic
    # "rephrase as a standalone question" prompt.
    prompt: Annotated[
        str,
        Field(
            description="The prompt to use for the LLM to transform the input into an actionable query.",
            examples=[
                "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {user_input}\nStandalone question:"
            ],
            default="Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {user_input}\nStandalone question:",
        ),
    ]

    # TODO impl validation on this schema
    output_schema: Annotated[
        Optional[Type[BaseModel]],
        Field(
            description="(Optional) Structured output schema, as a Pydantic `BaseModel`. If provided, will be added to the end of the prompt.",
            default=None,
        ),
    ] = None

    enable_chat_history: Annotated[
        bool,
        Field(
            description="Whether to enable chat history in the prompt.",
            default=True,
        ),
    ] = True
87
+
88
+
89
class RetrieverContextualizer(Runnable):
    """
    A Runnable that transforms natural language input into an actionable query
    for retrievers, based on the provided configuration.
    """

    def __init__(self, props: RetrieverContextualizerProps):
        """
        Initialize with validated props and build the processing chain.

        After construction, `_output_type` holds the type the chain produces:
        `props.output_schema` when one is given, otherwise the parser's own
        output type.
        """
        self.props = props
        # `_generate_chain` also sets `self.parser` and `self.prompt_template`,
        # which the attributes below depend on.
        self.chain = self._generate_chain()
        self._parser_type = self.parser._type
        self._output_type = (
            self.parser.OutputType
            if not self.props.output_schema
            else self.props.output_schema
        )

    def _generate_chain(self) -> Runnable:
        """
        Generate and return the appropriate chain based on props.

        Side effects:
            Sets `self.parser` and `self.prompt_template`.

        Returns:
            A Runnable chain that processes inputs according to the configuration.
        """
        # If output schema is provided, use PydanticOutputParser
        if self.props.output_schema:
            self.parser = PydanticOutputParser(pydantic_object=self.props.output_schema)
            # create a PromptTemplate with partials
            self.prompt_template = PromptTemplate(
                input_variables=(
                    ["chat_history", "user_input"]
                    if self.props.enable_chat_history
                    else ["user_input"]
                ),
                partial_variables={
                    # Pass the function itself, not its result: callable
                    # partials are evaluated each time the prompt is formatted,
                    # so the "current datetime" is not frozen at construction.
                    "realtime_context": self._get_realtime_context,
                    "prompt": self.props.prompt,
                    "format_instructions": self.parser.get_format_instructions(),
                },
                template=(
                    RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE_WITH_CHAT_HISTORY
                    if self.props.enable_chat_history
                    else RETRIEVER_CONTEXTUALIZER_PROMPT_TEMPLATE
                ),
            )

        else:
            # Otherwise, use the LLM and extract the string content
            # This ensures we get a clean string output rather than an LLM result object
            from langchain_core.output_parsers import StrOutputParser

            self.parser = StrOutputParser()
            self.prompt_template = PromptTemplate.from_template(self.props.prompt)

        # Each logging lambda returns its input unchanged (`logger.*` returns
        # None, so `or x` passes the value through to the next step).
        return (
            self.prompt_template
            | (lambda x: logger.debug(f"Contextualizer prompt: {x}") or x)
            | self.props.llm
            | (lambda x: logger.debug(f"Contextualizer LLM output: {x}") or x)
            | self.parser
            | (lambda x: logger.info(f"OutputParser output: {x}") or x)
        )

    def invoke(self, input: dict[str, Any], config=None, **kwargs) -> Any:
        """
        Process the input through the chain.

        Args:
            input: The input to process, typically containing 'user_input' and 'chat_history'.
            config: Optional configuration for the chain.

        Returns:
            The processed output, either a string or a structured object based on the output_schema.
        """
        return self.chain.invoke(input, config=config, **kwargs)

    @staticmethod
    def _get_realtime_context() -> str:
        """
        Get a string representing the realtime context of the retriever, with info such as:
        - current datetime (local time, as reported by `datetime.now()`)
        """
        return f"Current datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
@@ -0,0 +1,79 @@
1
+ from collections.abc import Sequence
2
+ from logging import getLogger
3
+
4
+ from langchain_core.documents import Document
5
+ from langchain_core.messages import BaseMessage
6
+ from logseq_retriever.loaders import LogseqJournalLoader
7
+ from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
8
+ from logseq_retriever.retrievers.contextualizer import RetrieverContextualizer
9
+ from logseq_retriever.loaders.journal_loader_input import LogseqJournalLoaderInput
10
+
11
+
12
+ logger = getLogger(__name__)
13
+
14
+
15
class LogseqJournalDateRangeRetriever(LogseqJournalRetriever):
    """
    A `Retriever` that retrieves documents from a Logseq journal within a specified date range.
    """

    def __init__(
        self,
        contextualizer: RetrieverContextualizer,
        loader: LogseqJournalLoader,
        verbose: bool = True,
    ):
        """
        Wire a contextualizer and a loader together.

        Args:
            contextualizer (`RetrieverContextualizer`): must be configured so that
                its output type is exactly `LogseqJournalLoaderInput`.
            loader (`LogseqJournalLoader`): receives the contextualizer's output.
            verbose: when True, log the contextualizer output and document count.

        Raises:
            TypeError: if either collaborator has the wrong type.
        """
        super().__init__()

        # Contextualizer checks run first, so their errors win over loader errors.
        if not isinstance(contextualizer, RetrieverContextualizer):
            raise TypeError(
                "Contextualizer must be an instance of RetrieverContextualizer"
            )
        if contextualizer._output_type != LogseqJournalLoaderInput:
            raise TypeError(
                "Contextualizer output type must be LogseqJournalLoaderInput"
            )
        self._contextualizer = contextualizer

        if not isinstance(loader, LogseqJournalLoader):
            raise TypeError("Loader must be an instance of LogseqJournalLoader")
        self._loader = loader
        self._verbose = verbose

    def _build_loader_input(
        self,
        query: str,
        chat_history: Sequence[BaseMessage] = (),
    ) -> LogseqJournalLoaderInput:
        """
        Ask the `RetrieverContextualizer` to turn the natural-language `query`
        (plus `chat_history`) into a `LogseqJournalLoaderInput` for the loader.

        Raises:
            TypeError: if the contextualizer returns anything else.
        """
        result = self._contextualizer.invoke(
            {"chat_history": chat_history, "user_input": query}
        )
        if self._verbose:
            logger.info(f"Contextualizer output: {result}")
        if isinstance(result, LogseqJournalLoaderInput):
            return result
        raise TypeError(
            f"Expected LogseqJournalLoaderInput but got {type(result).__name__}"
        )

    def _fetch_documents(
        self, loader_input: LogseqJournalLoaderInput
    ) -> list[Document]:
        """Run the loader with `loader_input` and return whatever it loads."""
        loaded = self._loader.load(loader_input)
        if self._verbose:
            logger.info(f"Retrieved {len(loaded)} documents")
        return loaded
@@ -0,0 +1,105 @@
1
+ from abc import abstractmethod
2
+ from collections.abc import Sequence
3
+ from logging import getLogger
4
+ from typing import Any
5
+
6
+ from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
7
+ from langchain_core.documents import Document
8
+ from langchain_core.exceptions import OutputParserException
9
+ from langchain_core.retrievers import BaseRetriever
10
+ from langchain_core.messages import BaseMessage
11
+ from pydantic import ValidationError
12
+
13
+
14
+ logger = getLogger(__name__)
15
+
16
+
17
class LogseqJournalRetriever(BaseRetriever):
    """
    A Langchain `Retriever` that is specifically for retrieving Logseq journal `Document`'s,
    based on a natural-language query. This `Retriever` will, in turn, leverage a Loader or
    Vectorstore to retrieve relevant documents to the query.
    """

    document_context: str = "These Documents represent journal entries. "

    def retrieve(
        self, query: str, chat_history: Sequence[BaseMessage] | None = None
    ) -> list[Document]:
        """
        Retrieve documents for `query` without going through LangChain's `invoke()`.

        Because `invoke()` is bypassed, LangSmith tracing and registered callbacks
        are not triggered; call `invoke()` when those are needed.
        """
        history = chat_history or ()
        return self._execute(query, history)

    def _get_relevant_documents(
        self,
        query: str | dict[str, Any],
        *,
        run_manager: CallbackManagerForRetrieverRun,
        chat_history: Sequence[BaseMessage] | None = None,
    ) -> list[Document]:
        """
        Entry point used by `invoke`.

        `query` is either the natural-language question itself, or a dict that
        bundles the question with its history:

        ```python
        query = {
            "user_input": "user's latest question",
            "chat_history": [("AiMessage", )]
        }
        ```

        For the dict form the question is read from `user_input`, `input` or
        `query` (first truthy one wins) and the history from the `chat_history`
        argument, then the `chat_history` or `history` keys.

        Returns potentially relevant `langchain_core.documents.Document`s to answer the query.
        """
        # Plain-string query: nothing to unpack.
        if not isinstance(query, dict):
            return self._execute(query, chat_history or [])

        question = (
            query.get("user_input") or query.get("input") or query.get("query", "")
        )
        history = chat_history or query.get("chat_history") or query.get("history")
        return self._execute(question, history or [])

    def _execute(
        self, query: str, chat_history: Sequence[BaseMessage]
    ) -> list[Document]:
        """
        Build the loader input, then fetch documents with it.

        A failure to build the input (wrong type, validation or output-parsing
        error) is logged and yields an empty list instead of propagating.
        """
        try:
            loader_input = self._build_loader_input(query, chat_history)
        except (TypeError, ValidationError, OutputParserException):
            logger.exception("Error building loader input")
            return []
        else:
            return self._fetch_documents(loader_input)

    @abstractmethod
    def _fetch_documents(
        self,
        loader_input: Any,
    ) -> list[Document]:
        """
        To be implemented by subclasses.
        Turn the `loader_input` produced by `_build_loader_input` into a list of
        `langchain_core.documents.Document`s.
        """
        raise NotImplementedError("This method shall be implemented by subclasses.")

    @abstractmethod
    def _build_loader_input(
        self,
        query: str,
        chat_history: Sequence[BaseMessage] = (),
    ) -> Any:
        """
        To be implemented by subclasses.
        Derive, from the user's query and optional chat_history, the object that
        `_fetch_documents` will use to load/query for relevant documents.
        """
        raise NotImplementedError("This method shall be implemented by subclasses.")
@@ -0,0 +1,95 @@
1
+ from collections.abc import Sequence
2
+ from logging import getLogger
3
+
4
+ from langchain_core.messages import BaseMessage
5
+ from langchain_core.documents import Document
6
+ from pgvector_template.core.search import SearchQuery
7
+ from pgvector_template.service import DocumentService
8
+
9
+ from logseq_retriever.retrievers.contextualizer import RetrieverContextualizer
10
+ from logseq_retriever.retrievers.journal_retriever import LogseqJournalRetriever
11
+ from logseq_retriever.models.journal_pgvector import JournalDocument
12
+
13
+
14
+ logger = getLogger(__name__)
15
+
16
+
17
class PGVectorJournalRetriever(LogseqJournalRetriever):
    """
    A `Retriever` that relies on a PGVector backend to fetch Logseq journals.
    """

    def __init__(
        self,
        contextualizer: RetrieverContextualizer,
        document_service: DocumentService,
        verbose: bool = True,
        **kwargs,
    ):
        """
        Wire a contextualizer and a document service together.

        Args:
            contextualizer: must produce `SearchQuery` (or a subclass) objects.
            document_service: its `search_client` runs the search.
            verbose: when True, log the contextualizer output and result count.
            **kwargs: accepted but not used here.

        Raises:
            TypeError: if either collaborator has the wrong type.
        """
        super().__init__()

        # Contextualizer checks run first, so their errors win over service errors.
        if not isinstance(contextualizer, RetrieverContextualizer):
            raise TypeError(
                "contextualizer must be an instance of RetrieverContextualizer"
            )
        if not issubclass(contextualizer._output_type, SearchQuery):
            raise TypeError(
                "contextualizer._output_type must be SearchQuery or a subclass"
            )
        self._contextualizer = contextualizer

        if not isinstance(document_service, DocumentService):
            raise TypeError("document_service must be an instance of DocumentService")
        self._document_service = document_service
        self._verbose = verbose

    def _build_loader_input(
        self,
        query: str,
        chat_history: Sequence[BaseMessage] = (),
    ) -> SearchQuery:
        """
        Ask the `RetrieverContextualizer` to turn the natural-language `query`
        (plus `chat_history`) into a `SearchQuery` for
        `DocumentService.search_client.search`.

        Raises:
            TypeError: if the contextualizer returns anything else.
        """
        result = self._contextualizer.invoke(
            {"chat_history": chat_history, "user_input": query}
        )
        if self._verbose:
            logger.info(f"Contextualizer output: {result}")
        if isinstance(result, SearchQuery):
            return result
        raise TypeError(
            f"Expected SearchQuery or subclass but got {type(result).__name__}"
        )

    def _fetch_documents(self, loader_input: SearchQuery) -> list[Document]:
        """
        Run `loader_input` (a `SearchQuery` or subclass) through the search client
        and convert every hit into a `langchain_core.documents.Document`.
        """
        hits = self._document_service.search_client.search(loader_input)
        if self._verbose:
            logger.info(f"Retrieved {len(hits)} documents from PGVector.")
        converted: list[Document] = []
        for hit in hits:
            converted.append(
                self._build_langchain_document_from_pgvector_document(hit.document)
            )
        return converted

    def _build_langchain_document_from_pgvector_document(
        self, pgvector_document: JournalDocument
    ) -> Document:
        """
        Copy a PGVector document's content (as `str`) and metadata into a LangChain `Document`.
        """
        text = str(pgvector_document.content)
        return Document(
            page_content=text,
            metadata=pgvector_document.document_metadata,
        )
File without changes
@@ -0,0 +1,9 @@
1
+ from logseq_retriever.uploaders.pgvector.journal_corpus_manager import (
2
+ JournalCorpusManagerConfig,
3
+ JournalCorpusManager,
4
+ )
5
+
6
+ __all__ = [
7
+ "JournalCorpusManagerConfig",
8
+ "JournalCorpusManager",
9
+ ]
@@ -0,0 +1,72 @@
1
+ import re
2
+ from typing import Any, Type
3
+
4
+ from pgvector_template.core import (
5
+ BaseCorpusManager,
6
+ BaseCorpusManagerConfig,
7
+ BaseDocument,
8
+ BaseDocumentMetadata,
9
+ )
10
+
11
+ from logseq_retriever.models.journal_pgvector import (
12
+ JournalDocument,
13
+ JournalDocumentMetadata,
14
+ )
15
+
16
+
17
class JournalCorpusManagerConfig(BaseCorpusManagerConfig):
    """Configuration for Logseq journal `JournalCorpusManager`.

    Declares journal-specific defaults on top of `BaseCorpusManagerConfig`:
    the schema name `"logseq_journal"`, `JournalDocument` as the document
    model and `JournalDocumentMetadata` as the document metadata model.
    """

    schema_name: str = "logseq_journal"
    """Name of the schema to use for the corpus manager"""
    document_cls: Type[BaseDocument] = JournalDocument
    """Class to use for the document model"""
    document_metadata_cls: Type[BaseDocumentMetadata] = JournalDocumentMetadata
    """Class to use for the document metadata model"""
    # NOTE: no default is declared here for `embedding_provider`
    # (a `BaseEmbeddingProvider`); per the original author's note it is still
    # required, presumably as a field of `BaseCorpusManagerConfig` - TODO confirm.
27
+
28
+
29
class JournalCorpusManager(BaseCorpusManager):
    """
    Corpus manager for Logseq journals.

    A `Corpus` here is the complete journal entry for one date; chunks are the
    root-level bullet points of that entry.
    """

    def _split_corpus(self, content: str, **kwargs) -> list[str]:
        """
        Split a journal file into its root-level bullet points.

        The text is cut at every newline that is immediately followed by `-`.
        Each piece is stripped of surrounding whitespace, then of one leading
        `-` and one leading space; pieces that end up empty are dropped.
        """
        chunks: list[str] = []
        for raw_chunk in content.split("\n-"):
            cleaned = raw_chunk.strip()
            # Only the first piece normally still carries its own "- " marker.
            cleaned = cleaned.removeprefix("-")
            cleaned = cleaned.removeprefix(" ")
            if cleaned:
                chunks.append(cleaned)
        return chunks

    def _extract_chunk_metadata(self, content: str, **kwargs) -> dict[str, Any]:
        """
        Build the metadata dict for a single chunk.

        Returns a dict with the chunk's character length (`chunk_len`), its
        whitespace-separated word count (`word_count`), the `#` references it
        contains (`references`) and any Logseq anchor IDs (`anchor_ids`).
        """
        words = content.split()
        references = self._extract_chunk_references(words)
        anchor_ids = self._extract_anchor_ids(content)
        return {
            "chunk_len": len(content),
            "word_count": len(words),
            "references": references,
            "anchor_ids": anchor_ids,
        }

    def _extract_chunk_references(self, split_content: list[str]) -> list[str]:
        """
        Collect references to other Logseq corpora (including other journals).

        A reference is a word starting with `#`, e.g. `#2025-07-07` or
        `#cookout`. Leading and trailing `#` are removed and backslashes are
        dropped. The reference ends at the first of the characters `!?,:'"`.
        Words that reduce to an empty string are skipped.
        """
        found: list[str] = []
        for token in split_content:
            if not token.startswith("#"):
                continue
            name = token.strip("#").replace("\\", "")
            # Truncate at the earliest terminating character.
            for terminator in "!?,:'\"":
                name = name.partition(terminator)[0]
            if name:
                found.append(name)
        return found

    def _extract_anchor_ids(self, content: str) -> list[str]:
        """
        Return every Logseq anchor ID found in `content`.

        An anchor is written as `id:: <uuid>`; the captured ID is the 36
        characters (lowercase hex digits and dashes) following `id:: `.
        """
        return [
            match.group(1)
            for match in re.finditer(r"id:: ([a-f0-9-]{36})", content)
        ]
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: logseq-retriever
3
+ Version: 0.4.1
4
+ Summary: Python library for loading and retrieving Logseq documents
5
+ Author-email: DL <v49t9zpqd@mozmail.com>
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.11
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: langchain<0.4.0,>=0.3.24
14
+ Requires-Dist: pydantic<3.0,>=2.11
15
+ Requires-Dist: pgvector-template>=0.3.4
16
+ Provides-Extra: scripts
17
+ Requires-Dist: python-dotenv; extra == "scripts"
18
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "scripts"
19
+ Requires-Dist: langchain-aws>=0.2.0; extra == "scripts"
20
+ Provides-Extra: test
21
+ Requires-Dist: pytest; extra == "test"
22
+ Requires-Dist: pytest-cov; extra == "test"
23
+ Requires-Dist: python-dotenv; extra == "test"
24
+ Requires-Dist: boto3; extra == "test"
25
+ Requires-Dist: boto3-stubs; extra == "test"
26
+ Requires-Dist: langchain-aws>=0.2.0; extra == "test"
27
+ Requires-Dist: psycopg[binary]>=3.0.0; extra == "test"
28
+ Requires-Dist: ty; extra == "test"
29
+ Requires-Dist: ruff; extra == "test"
30
+ Provides-Extra: dist
31
+ Requires-Dist: build>=1.2.2; extra == "dist"
32
+ Requires-Dist: twine>=6.1.0; extra == "dist"
33
+ Dynamic: license-file
34
+
35
+ # Logseq Retriever
36
+ Python library for loading and retrieving Logseq documents.
37
+
38
+ ---
39
+ ## Components
40
+ This section provides an overview of the library's components, listed by type.
41
+
42
+
43
+ ### Retrievers
44
+ Retrievers inject context into a conversation. They work in tandem with a Contextualizer and a `Document` Loader.
45
+ - **Input**:
46
+ - natural-language user-input, usually query-like
47
+ - (optional) chat history
48
+ - **Output**:
49
+ - list of `Document`s to provide context for an LLM to answer the user-input
50
+
51
+ #### Implementations
52
+ - `LogseqJournalDateRangeRetriever`
53
+ - retrieve Logseq journal `Document`s, intended for queries that require context from a date range
54
+ - required to set up:
55
+ - `RetrieverContextualizer`
56
+ - `LogseqJournalLoader`
57
+ - examples:
58
+ - "What did I do over Christmas break 2024?"
59
+ - "How did I spend the last Independence Day?"
60
+
61
+
62
+ ### Contextualizers
63
+ Contextualizers serve as the bridge between natural-language input and a downstream component that
64
+ handles fetching of relevant `Document`s.
65
+ - **Input**:
66
+ - natural-language user-input, usually query-like
67
+ - (optional) chat history
68
+ - **Output**:
69
+ - structured downstream query, based on the user-input and (optional) chat history
70
+
71
+ In this library, an instance of `RetrieverContextualizer` is provided directly to
72
+ `Retriever`s during the latter's instantiation. To set up the `RetrieverContextualizer`, provide
73
+ `RetrieverContextualizerProps`, which includes:
74
+ - `llm` - this is the backbone of the contextualizer
75
+ - `prompt` - instructions provided to the LLM
76
+ - `output_schema` - (optional) structured schema used to fetch relevant `Document`s
77
+ - if no schema is provided, a string shall be returned instead
78
+ - other flags and settings
79
+
80
+
81
+ ### Loaders
82
+ Loaders are one type of component that can fetch relevant `Document`s. Loaders are typically specific to
83
+ a corresponding `Retriever` component.
84
+ - **Input**:
85
+ - each loader specifies its own schema
86
+ - the Contextualizer is usually responsible for creating an instance of the query obj to act upon
87
+ - **Output**:
88
+ - `list[Document]`
89
+
90
+ #### Implementations
91
+ - `LogseqJournalFilesystemLoader`
92
+ - loads from the filesystem, where journal files are expected to be present at specified path
93
+
94
+
95
+ ---
96
+ ## Scripts
97
+
98
+ ### PGVector
99
+
100
+ #### `upload_journal`
101
+
102
+ usage: `python scripts/upload_journal_to_pgvector.py [-h] [-p PATH] from_date to_date`
@@ -0,0 +1,21 @@
1
+ logseq_retriever/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ logseq_retriever/loaders/__init__.py,sha256=L5qqUEjx9A5sWdzHG-4c04z_KTxPnihJDh15gVmtWcs,507
3
+ logseq_retriever/loaders/journal_document_metadata.py,sha256=q5XDHX95gnOePP9v-Fwv79sYBrqP8p9Qh4ZyFoddPRk,744
4
+ logseq_retriever/loaders/journal_filesystem_loader.py,sha256=w0Z5Z8YLe8TLWTKbakRTPJ7dRr9qUMJZUFik9IHb9RE,6244
5
+ logseq_retriever/loaders/journal_loader.py,sha256=9e2bK_LDrTYHlUlHamD7MgGzFq8DBCAiBVweXAUtnzM,433
6
+ logseq_retriever/loaders/journal_loader_input.py,sha256=A88-NueZs6h9fNRs5Vm6Y00PgRZImYji2p6Wum-2edA,2963
7
+ logseq_retriever/models/__init__.py,sha256=nkbyo3WbzF4Z4XQAM-IT_SDNFytRo6UGEc_L84j4K-o,346
8
+ logseq_retriever/models/journal_pgvector.py,sha256=wtn2SM_GKqK45QZnTH8tEoDlBPWOlGfcRWGl65YvcF8,3759
9
+ logseq_retriever/retrievers/__init__.py,sha256=6MOLyW01yHK_AYpi_L7fKD9nHhJ6XYds_pGi5bAiQBM,597
10
+ logseq_retriever/retrievers/contextualizer.py,sha256=61YYXVanrHdgL_96bRMtob7Rzs0LuhEJpVb6lOwo__A,6141
11
+ logseq_retriever/retrievers/journal_date_range_retriever.py,sha256=dB0jDY5JaA_YjZf2G1tGC2uy_e0gVV0PSX61rb7pe4E,2850
12
+ logseq_retriever/retrievers/journal_retriever.py,sha256=e7MK2GRVDqo56B-xLEpF0HElZxNWVm50O1zrLx7uLio,3768
13
+ logseq_retriever/retrievers/pgvector_journal_retriever.py,sha256=vYcnicWLdqBIgSgjb_xnNMKvrSyCDGKz6rmGwr9RTdw,3525
14
+ logseq_retriever/uploaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ logseq_retriever/uploaders/pgvector/__init__.py,sha256=Y5wQzHtrEXaUwP05JZ8RqP5en_udUUV-f8ptbGl4MQE,210
16
+ logseq_retriever/uploaders/pgvector/journal_corpus_manager.py,sha256=7xCV5cgI4mJHPxTG92WMH8ylXOjB0PP7obDKwXxNuOU,2655
17
+ logseq_retriever-0.4.1.dist-info/licenses/LICENSE,sha256=4chADZoF7TXixgJtj6FYx2PiAjCMreSUMHevGcgdSG4,1069
18
+ logseq_retriever-0.4.1.dist-info/METADATA,sha256=ZwfjrzO5REJpb_cJ4Qh_rg8zXL_6DqOMd5waNBEgeUU,3563
19
+ logseq_retriever-0.4.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ logseq_retriever-0.4.1.dist-info/top_level.txt,sha256=1WZc9D05rwFgELxnbrEddvzcscLyUXD1ODwLpVx0r90,17
21
+ logseq_retriever-0.4.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 David Ge Liu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ logseq_retriever