obsitex 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- obsitex/__init__.py +1 -0
- obsitex/cli.py +102 -0
- obsitex/constants.py +41 -0
- obsitex/parser/__init__.py +211 -0
- obsitex/parser/blocks.py +501 -0
- obsitex/parser/formatting.py +100 -0
- obsitex/planner/__init__.py +259 -0
- obsitex/planner/jobs.py +38 -0
- obsitex/planner/links.py +36 -0
- obsitex/utils.py +19 -0
- obsitex-0.0.0.dist-info/METADATA +12 -0
- obsitex-0.0.0.dist-info/RECORD +15 -0
- obsitex-0.0.0.dist-info/WHEEL +5 -0
- obsitex-0.0.0.dist-info/entry_points.txt +2 -0
- obsitex-0.0.0.dist-info/top_level.txt +1 -0
obsitex/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
from obsitex.parser import ObsidianParser
|
obsitex/cli.py
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
import argparse
|
2
|
+
import logging
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
from obsitex import ObsidianParser
|
6
|
+
from obsitex.constants import DEFAULT_JINJA2_MAIN_TEMPLATE
|
7
|
+
|
8
|
+
|
9
|
+
def main():
    """CLI entry point: convert Obsidian notes to a LaTeX document.

    Parses command-line options, builds an ObsidianParser, feeds it the
    input file or folder, and writes the rendered LaTeX to --main-tex.
    """
    arg_parser = argparse.ArgumentParser(description="Convert Obsidian notes to LaTeX")

    # Defines the inputs
    arg_parser.add_argument(
        "--input",
        "-i",
        type=Path,
        help="Path to the input file or folder containing the Obsidian notes.",
        required=True,
    )

    arg_parser.add_argument(
        "--bibtex",
        "-b",
        type=Path,
        help="Path to the BibTeX database file with all references.",
    )
    arg_parser.add_argument(
        "--graphics",
        "-g",
        type=Path,
        help="Path to the graphics folder, where all images are assumed to be stored.",
    )
    arg_parser.add_argument(
        "--template",
        "-t",
        type=Path,
        help="Path to the Jinja2 LaTeX template, won't use template if not provided.",
    )

    # Defines the outputs
    arg_parser.add_argument(
        "--main-tex",
        "-mt",
        type=Path,
        help="Path to the LaTeX file that will be generated, containing all compiled LaTeX.",
        required=True,
    )
    arg_parser.add_argument(
        "--main-bibtex",
        "-mb",
        type=Path,
        help="Path to the BibTeX file that will be generated, containing the references - only generated if citations are used.",
    )

    # Administrative options
    arg_parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        help="Enable debug mode, which will print additional information by enabling logging.",
    )

    args = arg_parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    if not args.input.exists():
        raise FileNotFoundError(f"Input path {args.input} does not exist.")

    # Read the template if it exists, otherwise fall back to the default.
    if args.template is not None and args.template.is_file():
        with open(args.template, "r") as file:
            template = file.read()
        logging.info(f"Using template from {args.template}.")
    else:
        # Fix: previously a template path that did not exist was silently
        # ignored; now the user is warned before falling back.
        if args.template is not None:
            logging.warning(
                f"Template path {args.template} is not a file, using default template."
            )
        template = DEFAULT_JINJA2_MAIN_TEMPLATE
        logging.info("No template provided, using default template.")

    # Create the parser. Renamed from `parser` to avoid shadowing the
    # argparse parser above. (`out_bitex_path` is the keyword name declared
    # by ObsidianParser, sic.)
    obsidian_parser = ObsidianParser(
        graphics_folder=args.graphics,
        main_template=template,
        bibtex_database_path=args.bibtex,
        out_bitex_path=args.main_bibtex,
    )

    if args.input.is_dir():
        obsidian_parser.add_dir(args.input)
    elif args.input.is_file():
        obsidian_parser.add_file(args.input)
    else:
        raise ValueError(f"Invalid path: {args.input}")

    with open(args.main_tex, "w") as file:
        file.write(obsidian_parser.to_latex())

    print(f"Output written to {args.main_tex}")


if __name__ == "__main__":
    main()
|
obsitex/constants.py
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Default per-job (per-note) Jinja2 template: emit the parsed LaTeX verbatim.
DEFAULT_JINJA2_JOB_TEMPLATE = "{{ parsed_latex_content }}"
# Default top-level Jinja2 template wrapping the whole generated document.
DEFAULT_JINJA2_MAIN_TEMPLATE = """
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% This file was automatically generated by obsitex.
%% A tool to convert Obsidian markdown files to LaTeX.
%% https://github.com/ruipreis/obsitex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

{{ parsed_latex_content }}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% End of generated file
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
"""

# Maps relative header levels to LaTeX sectioning commands. Negative levels
# exist so nested folder structures can promote headers above \section.
DEFAULT_HLEVEL_MAPPING = {
    -2: "part",
    -1: "chapter",
    0: "section",
    1: "subsection",
    2: "subsubsection",
    3: "paragraph",
}

# How markers are placed in parsed latex
DEFAULT_APPENDIX_MARKER = """
\\appendix
"""

DEFAULT_BIBLIOGRAPHY_MARKER = """
\\bibliography{main}
"""

# Callout tags that are handled by dedicated block classes rather than as
# plain quotes.
SPECIAL_CALLOUTS = [
    "[!figure]",
    "[!table]",
    "[!chart]",
]

# Prefix of quoted lines in markdown ("> ").
QUOTE_MARKER = "> "
# Delimiter of the embedded YAML config section inside a callout.
CALLOUT_CONFIG_MARKER = "%%"
|
@@ -0,0 +1,211 @@
|
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional, Sequence
|
4
|
+
|
5
|
+
import bibtexparser
|
6
|
+
from jinja2 import Environment
|
7
|
+
|
8
|
+
from obsitex.constants import (
|
9
|
+
DEFAULT_APPENDIX_MARKER,
|
10
|
+
DEFAULT_BIBLIOGRAPHY_MARKER,
|
11
|
+
DEFAULT_HLEVEL_MAPPING,
|
12
|
+
DEFAULT_JINJA2_JOB_TEMPLATE,
|
13
|
+
DEFAULT_JINJA2_MAIN_TEMPLATE,
|
14
|
+
)
|
15
|
+
from obsitex.parser.blocks import (
|
16
|
+
PARSEABLE_BLOCKS,
|
17
|
+
LaTeXBlock,
|
18
|
+
MarkerBlock,
|
19
|
+
Paragraph,
|
20
|
+
Section,
|
21
|
+
)
|
22
|
+
from obsitex.planner import ExecutionPlan
|
23
|
+
from obsitex.planner.jobs import AddBibliography, AddHeader, AddText, PlannedJob
|
24
|
+
|
25
|
+
# Increase logging level to bibtexparser - avoid warnings
|
26
|
+
logging.getLogger("bibtexparser").setLevel(logging.ERROR)
|
27
|
+
|
28
|
+
|
29
|
+
class ObsidianParser:
    """Turns planned jobs from an ExecutionPlan into a LaTeX document.

    Jobs (headers, text, bibliography) are converted into LaTeXBlock
    instances, then rendered through Jinja2 job/main templates.
    """

    def __init__(
        self,
        bibtex_database_path: Optional[Path] = None,
        implictly_add_bibtex: bool = True,  # (sic) name kept for API compatibility
        out_bitex_path: Optional[Path] = None,  # (sic) where the filtered .bib is written
        graphics_folder: Optional[Path] = None,
        job_template: str = DEFAULT_JINJA2_JOB_TEMPLATE,
        main_template: str = DEFAULT_JINJA2_MAIN_TEMPLATE,
        hlevel_mapping: dict = DEFAULT_HLEVEL_MAPPING,
        appendix_marker: str = DEFAULT_APPENDIX_MARKER,
        bibliography_marker: str = DEFAULT_BIBLIOGRAPHY_MARKER,
        base_hlevel: int = 0,
    ):
        self.job_template = job_template
        self.main_template = main_template
        self.hlevel_mapping = hlevel_mapping
        self.appendix_marker = appendix_marker
        self.bibliography_marker = bibliography_marker
        self.out_bitex_path = out_bitex_path

        # Construct an execution plan, which will collect the jobs to run from
        # the files and paths provided
        self.execution_plan = ExecutionPlan(
            bibtex_database_path=bibtex_database_path,
            implictly_add_bibtex=implictly_add_bibtex,
        )

        # Extra arguments that should be injected when converting to latex
        self.extra_args = {
            "hlevel_mapping": self.hlevel_mapping,
            "graphics_folder": graphics_folder,
        }

        # Flag to continuously check if in appendix
        self.in_appendix = False

        # Set of blocks that will be added to the main tex file
        self.blocks: Sequence[LaTeXBlock] = []

        # Keep track of the latest header level
        self.base_hlevel = base_hlevel
        self.latest_parsed_hlevel = base_hlevel

    def add_file(self, file_path: Path, adjust_hlevel: bool = True):
        """Queue a single markdown file for parsing."""
        # By default adding a file assumes a single file structure
        if adjust_hlevel:
            self.latest_parsed_hlevel = self.base_hlevel - 1

        self.execution_plan.add_file(file_path)

    def add_dir(self, dir_path: Path):
        """Queue a whole vault folder for parsing."""
        self.execution_plan.add_dir(dir_path)

    def apply_jobs(self):
        """Consume every planned job, populating self.blocks."""
        for job in self.execution_plan.iter_jobs():
            self.parse_job(job)

    def to_latex(self) -> str:
        """Render all queued content into the final LaTeX string."""
        # Reset the parser blocks and apply
        self.blocks = []
        self.apply_jobs()

        # Create template for job level and main
        job_template = Environment().from_string(self.job_template)
        main_template = Environment().from_string(self.main_template)

        # Render each block onto the job template
        rendered_blocks = "\n\n".join(
            [
                job_template.render(
                    parsed_latex_content=block.formatted_text(**self.extra_args),
                    **block.metadata,
                )
                for block in self.blocks
            ]
        )

        # Render the main template with the rendered blocks
        # the global variables are shared by all blocks, we use the first
        # block for simplicity
        if len(self.blocks) > 0:
            global_configs = self.blocks[0].metadata
        else:
            global_configs = {}

        return main_template.render(
            parsed_latex_content=rendered_blocks,
            **global_configs,
        )

    def parse_job(self, job: PlannedJob) -> None:
        """Dispatch a job to the matching _parse_* handler (mutates self.blocks)."""
        # Latch the appendix flag: once a job is in the appendix, all
        # subsequent output stays there.
        if not self.in_appendix:
            self.in_appendix = job.is_in_appendix

        # If in appendix, add the appendix marker
        if self.in_appendix:
            marker_block = MarkerBlock(self.appendix_marker)
            marker_block.metadata = job.configs
            self.blocks.append(marker_block)
            logging.info("Added appendix marker to the parser.")

        # Given a job, append the corresponding blocks
        if isinstance(job, AddHeader):
            self.latest_parsed_hlevel = job.level
            return self._parse_header(job)
        elif isinstance(job, AddText):
            return self._parse_text(job)
        elif isinstance(job, AddBibliography):
            return self._parse_bibliography(job)
        else:
            raise ValueError(f"Unknown job type {job}")

    def _parse_header(self, job: AddHeader):
        """Append a Section block for the job's header."""
        section_block = Section(job.level, job.header)
        self.blocks.append(section_block)
        logging.info(
            f'Added header "{job.header}" with level {job.level} to the parser.'
        )

    def _parse_text(self, job: AddText):
        """Scan the job's text line by line, detecting typed blocks.

        Each PARSEABLE_BLOCKS class gets a chance to claim the current line;
        unmatched lines become Paragraph blocks.
        """
        lines = job.text.split("\n")
        curr_i = 0
        initial_block_count = len(self.blocks)

        while curr_i < len(lines):
            found_block = False

            for block_class in PARSEABLE_BLOCKS:
                block_instance = block_class.detect_block(lines, curr_i)

                if block_instance is not None:
                    block, curr_i = block_instance

                    # Headers inside a note are relative to the note's level.
                    if isinstance(block, Section):
                        block.hlevel += self.latest_parsed_hlevel

                    found_block = True
                    block.metadata = job.configs
                    self.blocks.append(block)
                    break

            if not found_block:
                # If remaining, assume it's a paragraph
                paragraph_block = Paragraph(lines[curr_i])
                paragraph_block.metadata = job.configs
                self.blocks.append(paragraph_block)

            curr_i += 1

        logging.info(
            f"Added {len(self.blocks) - initial_block_count} blocks to the parser, total {len(self.blocks)}."
        )

    def _parse_bibliography(self, job: AddBibliography):
        """Export only the cited entries to out_bitex_path and add the marker.

        Raises ValueError if no output path was configured or if any cited
        key is missing from the BibTeX database.
        """
        if self.out_bitex_path is None:
            raise ValueError("Bibliography was added but no output path was set.")

        # Select the keys to be included in the bibliography, and export
        with open(job.bibtex_path, "r") as file:
            bib_database = bibtexparser.load(file)

        # Index the bib tex keys and verify if all are present
        bib_keys = {entry["ID"]: entry for entry in bib_database.entries}
        missing_keys = [key for key in job.citations if key not in bib_keys]

        if len(missing_keys) > 0:
            raise ValueError(
                f"Missing {len(missing_keys)} keys in bibliography: {missing_keys}"
            )

        # Write the selected entries to a new BibTeX file
        # NOTE(review): BibTexParser is a parser, not a BibDatabase; dump()
        # may expect bibtexparser.bibdatabase.BibDatabase here - confirm
        # that dumping this object works across bibtexparser versions.
        new_db = bibtexparser.bparser.BibTexParser()  # Get a new BibDatabase instance
        new_db.entries = [bib_keys[key] for key in job.citations]

        with open(self.out_bitex_path, "w") as file:
            bibtexparser.dump(new_db, file)

        # Add the proper marker
        marker_block = MarkerBlock(self.bibliography_marker)
        marker_block.metadata = job.configs
        self.blocks.append(marker_block)
        logging.info("Added bibliography marker to the parser.")
|
obsitex/parser/blocks.py
ADDED
@@ -0,0 +1,501 @@
|
|
1
|
+
import re
|
2
|
+
from abc import ABC, abstractmethod
|
3
|
+
from io import StringIO
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Optional, Sequence, Tuple, Type
|
6
|
+
|
7
|
+
import yaml
|
8
|
+
|
9
|
+
from obsitex.constants import CALLOUT_CONFIG_MARKER, QUOTE_MARKER, SPECIAL_CALLOUTS
|
10
|
+
from obsitex.parser.formatting import detect_command, find_next_index, format_text
|
11
|
+
|
12
|
+
|
13
|
+
class LaTeXBlock(ABC):
    """Base class for every parsed markdown block renderable to LaTeX."""

    def __init__(self, content, in_latex=False):
        # Raw content: either a single string or a sequence of lines.
        self.content = content
        self.parent = None  # Only Section and Project objects can be parents
        # When True, content is emitted verbatim (no markdown escaping).
        self.in_latex = in_latex
        self._is_after_appendix = False
        # Per-block metadata (job configs) injected by ObsidianParser.
        self.metadata = {}

    @property
    def is_after_appendix(self):
        # Whether this block appears after the \appendix marker.
        return self._is_after_appendix

    @is_after_appendix.setter
    def is_after_appendix(self, value):
        self._is_after_appendix = value

    def formatted_text(self, **kwargs):
        """Render the block to LaTeX text.

        Verbatim (in_latex) content is joined as-is; everything else is run
        through format_text() for markdown-to-LaTeX escaping.
        """
        if self.in_latex:
            if isinstance(self.content, str):
                return self.content
            else:
                return "\n".join(self.content)
        else:
            if isinstance(self.content, str):
                text_lines = format_text([self.content])
            else:
                text_lines = format_text(self.content)

            return "\n".join(text_lines)

    @staticmethod
    @abstractmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        """
        Detects if the block is present in the lines starting from the index.

        Returns the index of the last line of the block if the block is detected.
        """
        pass
|
54
|
+
|
55
|
+
|
56
|
+
class Paragraph(LaTeXBlock):
    """A plain markdown paragraph (a single unmatched line of text)."""

    def __init__(self, content):
        # Paragraph text always goes through markdown-to-LaTeX formatting.
        super().__init__(content, in_latex=False)

    @staticmethod
    def detect_block(lines, index):
        # Paragraphs are the fallback block type: never detected explicitly,
        # the parser creates them for any line no other block claims.
        return None
|
63
|
+
|
64
|
+
|
65
|
+
class MarkerBlock(LaTeXBlock):
    """Verbatim LaTeX injected for flow control (e.g. \\appendix, \\bibliography)."""

    def __init__(self, content):
        # Markers are raw LaTeX; never escaped.
        super().__init__(content, in_latex=True)

    @staticmethod
    def detect_block(lines, index):
        # Markers are created programmatically by the parser, never detected
        # from markdown input.
        return None
|
73
|
+
|
74
|
+
|
75
|
+
class Section(LaTeXBlock):
    """A sectioning command parsed from a markdown '#'-style header line."""

    def __init__(self, hlevel: int, title: str):
        super().__init__(None)
        # Relative header level; mapped to a LaTeX command via hlevel_mapping.
        self.hlevel = hlevel
        self.title = title

    @property
    def label(self):
        """LaTeX label derived from the title (non-word characters -> '_')."""
        reformatted_title = re.sub(r"\W", "_", self.title)
        return f"sec:{reformatted_title}"

    def __repr__(self):
        return f'Section(hlevel={self.hlevel}, title="{self.title}")'

    def formatted_text(self, **kwargs):
        """Render e.g. '\\section{Title}\\label{sec:Title}' per hlevel_mapping.

        Raises ValueError when hlevel_mapping is missing or does not cover
        this block's level.
        """
        if "hlevel_mapping" not in kwargs:
            raise ValueError("hlevel_mapping not provided in kwargs")
        else:
            hlevel_mapping = kwargs["hlevel_mapping"]

        if self.hlevel not in hlevel_mapping:
            raise ValueError(f"Header level {self.hlevel} not found in hlevel_mapping")

        content = (
            f"\\{hlevel_mapping[self.hlevel]}{{{self.title}}}\\label{{{self.label}}}"
        )

        return content

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["Section", int]]:
        # Fix: the original pattern r"^(#+)\s*(.+)\s*" used a greedy group,
        # so trailing whitespace leaked into the title (and therefore into
        # the generated \label). The lazy group + end anchor strips it.
        header_match = re.match(r"^(#+)\s*(.+?)\s*$", lines[index])

        if header_match is not None:
            hlevel = len(header_match.group(1))
            title = header_match.group(2)

            return Section(hlevel, title), index

        return None
|
117
|
+
|
118
|
+
|
119
|
+
class Equation(LaTeXBlock):
    """A display equation delimited by '$$' lines."""

    def __init__(self, content, label: Optional[str] = None):
        super().__init__(content)
        # Optional LaTeX label (e.g. "eq:energy"), emitted via \label{}.
        self.label = label

    def formatted_text(self, **kwargs):
        """Wrap the raw equation body in a \\begin{equation} environment."""
        equation_text = "\\begin{equation}\n"

        if self.label is not None:
            equation_text += f"\t\\label{{{self.label}}}\n"

        equation_text += f"\t{self.content}\n\\end{{equation}}\n"

        return equation_text

    def __repr__(self):
        return f"Equation(content={self.content}, label={self.label})"

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        def _is_equation(line):
            return line.startswith("$$")

        if _is_equation(lines[index]):
            # Get the label if it exists
            # NOTE(review): detect_command() anchors its regex at the start
            # of the line, but this line starts with "$$", so it looks like
            # the label can never be detected here - confirm intended syntax.
            label = detect_command(lines[index])

            # Find the closing "$$" delimiter
            end_index = find_next_index(lines, _is_equation, index + 1)

            # Everything between the delimiters is the equation body
            equation_content = "\n".join(lines[index + 1 : end_index])

            return Equation(equation_content, label), end_index

        return None
|
157
|
+
|
158
|
+
|
159
|
+
class AbstractList(LaTeXBlock):
    """Shared behavior for ordered/unordered markdown lists."""

    def __init__(self, lines: Sequence[str]):
        super().__init__(lines)
        # One entry per list item, with the markdown marker already removed.
        self.lines = lines

    @abstractmethod
    def list_type(self):
        # Name of the LaTeX environment ("itemize" or "enumerate").
        pass

    def formatted_text(self, **kwargs):
        """Render the items as \\item lines inside the list environment."""
        list_type = self.list_type()
        content = f"\\begin{{{list_type}}}\n"

        for line in format_text(self.lines):
            content += f"\t\\item {line}\n"

        content += f"\\end{{{list_type}}}\n"

        return content

    def __repr__(self):
        return f"{self.list_type()} list (lines={self.lines})"

    @staticmethod
    def detect_block(
        lines: Sequence[str],
        index: int,
        item_regex_pattern: str,
        instance_class: Type["LaTeXBlock"],
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        """Detect a run of list items matching item_regex_pattern.

        Returns (instance_class instance, end_index) or None.
        """
        regex_pattern = item_regex_pattern

        def _is_list_item(line):
            return re.match(regex_pattern, line)

        not_is_list_item = lambda line: not _is_list_item(line)

        if _is_list_item(lines[index]):
            # Find the end of the list (first non-item line)
            end_index = find_next_index(lines, not_is_list_item, index + 1)

            # Extract the list content
            item_lines = lines[index:end_index]

            # Remove the list markers ("- ", "1. ", ...) from each item
            list_content = [re.sub(regex_pattern, "", line) for line in item_lines]

            return instance_class(list_content), end_index

        return None
|
209
|
+
|
210
|
+
|
211
|
+
class UnorderedList(AbstractList):
    """Bulleted markdown list ("- item"), rendered as LaTeX itemize."""

    def list_type(self):
        return "itemize"

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        # Items start with a dash followed by whitespace; the shared scanner
        # in AbstractList does the rest.
        return AbstractList.detect_block(lines, index, r"^-\s+", UnorderedList)
|
220
|
+
|
221
|
+
|
222
|
+
class OrderedList(AbstractList):
    """Numbered markdown list ("1. item"), rendered as LaTeX enumerate."""

    def list_type(self):
        return "enumerate"

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        # Items start with digits, a dot and whitespace; the shared scanner
        # in AbstractList does the rest.
        return AbstractList.detect_block(lines, index, r"^\d+\.\s+", OrderedList)
|
231
|
+
|
232
|
+
|
233
|
+
class Quote(LaTeXBlock):
    """Markdown block quote, rendered via csquotes' displayquote environment."""

    def __init__(self, content):
        super().__init__(content)
        self.lines = content

    def formatted_text(self, **kwargs):
        """Render the quoted lines inside \\begin{displayquote}."""
        body = "".join(f"\t{line}\n" for line in format_text(self.lines))
        return "\\begin{displayquote}\n" + body + "\\end{displayquote}\n"

    def __repr__(self):
        return f"Quote(content={self.content})"

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        first = lines[index]

        # Only plain quotes: special callouts ([!figure], [!table], ...) are
        # claimed by their dedicated block classes.
        if not first.startswith(">") or any(tag in first for tag in SPECIAL_CALLOUTS):
            return None

        end_index = find_next_index(
            lines, lambda ln: not ln.startswith(">"), index + 1
        )
        return Quote(lines[index:end_index]), end_index
|
268
|
+
|
269
|
+
|
270
|
+
class AbstractCallout(LaTeXBlock):
    """Base for callout blocks ("> [!figure] ...", "> [!table] ...")."""

    def __init__(self, caption: str, lines: Sequence[str], configs: dict):
        super().__init__(lines)
        self.caption = caption.strip()
        self.lines = lines
        # Options parsed from the embedded YAML config section (may be {}).
        self.configs = configs

    def __repr__(self):
        return f'{self.__class__.__name__}(caption="{self.caption}", lines={self.lines}, configs={self.configs})'

    @staticmethod
    def detect_block(lines, index, callout: str, instance_class: Type["LaTeXBlock"]):
        """Detect a "> [!<callout>] caption" block starting at `index`.

        Returns (instance, end_index) or None. An optional YAML config
        section delimited by CALLOUT_CONFIG_MARKER lines may be embedded
        inside the callout body; it is parsed and removed from the body.
        """
        # Fix: the original non-raw f-string relied on invalid escape
        # sequences (\s, \[), which raise SyntaxWarning on modern Python.
        callout_pattern = re.compile(rf"^>\s*\[!{callout}\]\s*(.*)\s*")
        re_match = callout_pattern.match(lines[index])

        if re_match is None:
            return None

        caption = re_match.group(1)
        end_index = find_next_index(
            lines, lambda line: not line.startswith(">"), index + 1
        )
        callout_lines = lines[index + 1 : end_index]

        # Strip the quote marker ("> ") from the body lines.
        # NOTE(review): replace() removes every occurrence, not only the
        # leading one - confirm body content never legitimately contains "> ".
        callout_lines = [l.replace(QUOTE_MARKER, "") for l in callout_lines]

        # Might contain configurations in the callout, need to confirm
        start_config_marker_index = find_next_index(
            callout_lines, lambda line: line.startswith(CALLOUT_CONFIG_MARKER), 0
        )

        if start_config_marker_index < len(callout_lines):
            # Then there could be properties
            end_config_marker_index = find_next_index(
                callout_lines,
                lambda line: line.startswith(CALLOUT_CONFIG_MARKER),
                start_config_marker_index + 1,
            )
            config_lines = callout_lines[
                start_config_marker_index + 1 : end_config_marker_index
            ]

            try:
                configs = yaml.safe_load("\n".join(config_lines))
            except yaml.YAMLError as err:
                # Fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit; chain the cause instead.
                raise ValueError(
                    f"Could not parse configurations in callout ({callout}): {config_lines}"
                ) from err

            # Drop the config section from the rendered callout body
            callout_lines = callout_lines[:start_config_marker_index]
        else:
            configs = {}

        return instance_class(caption, callout_lines, configs), end_index
|
324
|
+
|
325
|
+
|
326
|
+
class Table(AbstractCallout):
    """Table callout: a markdown pipe-table rendered via pandas' to_latex."""

    def __init__(self, caption: str, lines: Sequence[str], configs: dict):
        super().__init__(caption, lines, configs)

        # pandas is an optional dependency, only required for tables.
        try:
            import pandas as pd
        except:
            raise ImportError(
                "You defined a table, but pandas is not installed. Please install pandas to use tables."
            )

        # Parse the table
        table_content = "\n".join(format_text(self.lines))

        # '|' is the markdown column separator; drop fully-empty rows/cols
        # produced by the leading/trailing pipes.
        df = (
            pd.read_table(StringIO(table_content), sep="|", engine="python")
            .dropna(how="all", axis=1)
            .dropna(how="all")
        )

        # Clean up column names and content (strip leading/trailing whitespace)
        df.columns = df.columns.str.strip()
        df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

        # Step 2: Filter out any rows filled with '----' (typically separators)
        df = df[~df.apply(lambda row: row.str.contains("----").any(), axis=1)]

        self.df = df

        # Check for latex specific configurations in the configs.
        # Default column format: first column left-aligned, rest right-aligned.
        position = self.configs.get("position", None)
        column_format = self.configs.get(
            "column_format", "l" + "r" * (len(df.columns) - 1)
        )
        centering = self.configs.get("centering", True)

        self.latex_content = df.to_latex(
            index=False,
            caption=caption,
            position=position,
            column_format=column_format,
        )

        if centering:
            # Inject \centering right after \begin{table}.
            split_latex_content = self.latex_content.split("\n")
            split_latex_content.insert(1, "\\centering")
            self.latex_content = "\n".join(split_latex_content)

    def formatted_text(self, **kwargs):
        # The LaTeX was fully rendered in __init__.
        return self.latex_content

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        return AbstractCallout.detect_block(lines, index, "table", Table)
|
382
|
+
|
383
|
+
|
384
|
+
class Figure(AbstractCallout):
    """Figure callout: "> [!figure] caption" wrapping an ![[image]] embed."""

    def __init__(self, caption: str, lines: Sequence[str], configs: dict):
        super().__init__(caption, lines, configs)

        embed_match = re.match(r"\s*\!\[\[(.*?)\]\]", self.lines[0])
        if embed_match is None:
            raise ValueError(f"Could not find image in callout: {self.lines[0]}")

        # Keep only the bare file name: drop the "|size" suffix and any
        # folder components of the Obsidian embed target.
        target = embed_match.group(1)
        target = target.split("|")[0]
        self.target_image = target.split("/")[-1]

        # LaTeX figure options supplied via the YAML config section.
        self.label = self.configs.get("label", None)
        self.position = self.configs.get("position", None)
        self.centering = self.configs.get("centering", True)
        self.width = self.configs.get("width", 0.5)

    def formatted_text(self, **kwargs):
        """Render a figure environment; requires a graphics_folder kwarg."""
        graphics_folder: Optional[Path] = kwargs.get("graphics_folder", None)

        if graphics_folder is None:
            raise ValueError(
                "You defined a figure, but no graphics folder was provided."
            )

        image_path = (graphics_folder / self.target_image).resolve()

        if not image_path.exists():
            raise FileNotFoundError(f"Could not find image {image_path}")

        parts = ["\\begin{figure}"]

        if self.position is not None:
            parts.append(f"[{self.position}]")

        parts.append("\n")

        if self.centering:
            parts.append("\\centering\n")

        parts.append(
            f"\\includegraphics[width={self.width}\\textwidth]{{{image_path}}}\n"
        )

        # Captions may contain citations/markdown, so format them like
        # regular paragraph text.
        caption = format_text([self.caption])[0]
        parts.append(f"\\caption{{{caption}}}\n")

        if self.label is not None:
            parts.append(f"\\label{{fig:{self.label}}}\n")

        parts.append("\\end{figure}\n")

        return "".join(parts)

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        return AbstractCallout.detect_block(lines, index, "figure", Figure)
|
443
|
+
|
444
|
+
|
445
|
+
class RawLaTeXBlock(LaTeXBlock):
    """Fenced ```latex code block, passed through to the output verbatim."""

    def __init__(self, content):
        super().__init__(content, in_latex=True)

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        if not lines[index].startswith("```latex"):
            return None

        # The block runs until the next fence line.
        end_index = find_next_index(
            lines, lambda line: line.startswith("```"), index + 1
        )
        return RawLaTeXBlock(lines[index + 1 : end_index]), end_index
|
461
|
+
|
462
|
+
|
463
|
+
class TikZBlock(LaTeXBlock):
    """Fenced ```tikz block, cleaned of Obsidian-preview-only directives."""

    def __init__(self, content):
        super().__init__(content, in_latex=True)

    @staticmethod
    def detect_block(
        lines: Sequence[str], index: int
    ) -> Optional[Tuple["LaTeXBlock", int]]:
        if not lines[index].startswith("```tikz"):
            return None

        end_index = find_next_index(
            lines, lambda line: line.startswith("```"), index + 1
        )
        body = "\n".join(lines[index + 1 : end_index])

        # The Obsidian tikz plugin requires a full document plus package
        # directives to preview; the generated LaTeX document already
        # provides those, so strip them here.
        body = body.replace("\\begin{document}", "")
        body = body.replace("\\end{document}", "")
        body = re.sub(r"\\usepackage.*\n", "", body)
        body = re.sub(r"\\usetikzlibrary.*\n", "", body)

        return TikZBlock(body), end_index
|
489
|
+
|
490
|
+
|
491
|
+
# Block types tried in this order for each line of text; the first class
# whose detect_block() matches claims the line(s). Paragraph is the implicit
# fallback and is deliberately not listed.
PARSEABLE_BLOCKS: Sequence[Type[LaTeXBlock]] = [
    Section,
    Equation,
    UnorderedList,
    OrderedList,
    Table,
    Quote,
    Figure,
    RawLaTeXBlock,
    TikZBlock,
]
|
@@ -0,0 +1,100 @@
|
|
1
|
+
import copy
|
2
|
+
import re
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
LATEX_SPECIAL_CHARS = r"$%_}&#{"
|
6
|
+
|
7
|
+
|
8
|
+
def find_next_index(lst, expr, start=0):
    """Return the index of the first element at or after `start` for which
    `expr` is truthy; return len(lst) when nothing matches."""
    return next(
        (idx for idx in range(start, len(lst)) if expr(lst[idx])),
        len(lst),
    )
|
13
|
+
|
14
|
+
|
15
|
+
def detect_command(line) -> Optional[str]:
    """Extract the text of a '%% command %%' marker at the start of `line`.

    Returns the command with surrounding whitespace removed, or None when
    the line does not begin with such a marker.
    """
    match = re.match(r"%%\s*(.*)\s*%%", line)
    command = None

    if match is not None:
        # Fix: the greedy group swallowed the whitespace intended for the
        # trailing \s*, so commands like "%% eq:x %%" returned "eq:x " with
        # a trailing space; rstrip() removes it.
        command = match.group(1).rstrip()

    return command
|
23
|
+
|
24
|
+
|
25
|
+
# Function to group citations and replace
|
26
|
+
def replace_adjacent_citations(text):
    """Collapse each run of adjacent [[@key]] citations into one \\citep{...}.

    Runs may be separated by optional commas/whitespace; a lone citation is
    a run of one.
    """
    # One or more [[@...]] links, optionally joined by ", " separators.
    run_pattern = r"(\[\[@[^\]]+?\]\](\s*,\s*)*)*\[\[@[^\]]+?\]\]"

    for run in re.finditer(run_pattern, text):
        matched = run.group(0)
        # Pull the individual keys out of the run and merge them.
        keys = re.findall(r"\[\[@([^\]]+?)\]\]", matched)
        text = text.replace(matched, f"\\citep{{{','.join(keys)}}}")

    return text
|
46
|
+
|
47
|
+
|
48
|
+
def format_text(text_lines_origin):
    """Apply inline Markdown-to-LaTeX conversions to a list of text lines.

    Returns a new list (the input is deep-copied, never mutated). Per line:
    LaTeX special characters are escaped, Markdown bold/italic/monospace/
    highlight become LaTeX commands, `fig:`/`eq:`/`alg:` code spans become
    \\autoref references, and adjacent [[@...]] citations are merged into a
    single \\citep. Inline equations ($...$), wiki links ([[...]]) and code
    spans (`...`) are shielded from escaping via placeholders and restored
    afterwards — the step order below is deliberate and must be preserved.
    """
    # Inspired by Alejandro Daniel Noel
    # In his code https://github.com/adanielnoel/Obsidian-to-latex/blob/master/parser_utils.py
    # Modified by me to fit the needs of this project
    text_lines = copy.deepcopy(text_lines_origin)

    for i in range(len(text_lines)):
        # ===== SPECIAL CHARACTERS =====
        # Extract equations, links and code spans first and replace them by
        # placeholders so the escaping below cannot corrupt their contents.
        equations = re.findall(r"\$.*?\$", text_lines[i])
        links = re.findall(r"\[\[.*?]]", text_lines[i])
        codes = re.findall(r"`.*?`", text_lines[i])
        text_lines[i] = re.sub(r"\$.*?\$", "<EQ-PLACEHOLDER>", text_lines[i])
        text_lines[i] = re.sub(r"\[\[.*?]]", "<LINK-PLACEHOLDER>", text_lines[i])
        text_lines[i] = re.sub(r"`.*?`", "<CODE-PLACEHOLDER>", text_lines[i])
        # Escape characters that LaTeX treats specially in text mode.
        for special_char in LATEX_SPECIAL_CHARS:
            text_lines[i] = text_lines[i].replace(special_char, f"\\{special_char}")
        # Wrap square brackets in a group so LaTeX does not parse them as
        # optional block arguments.
        text_lines[i] = re.sub(r"(?<!\[)(\[.*])(?!])", r"{\1}", text_lines[i])
        # Restore the shielded spans in their original order (one placeholder
        # consumed per occurrence, hence the count=1 on each replace).
        for link in links:
            text_lines[i] = text_lines[i].replace(r"<LINK-PLACEHOLDER>", link, 1)
        for equation in equations:
            text_lines[i] = text_lines[i].replace(r"<EQ-PLACEHOLDER>", equation, 1)
        for code in codes:
            text_lines[i] = text_lines[i].replace(r"<CODE-PLACEHOLDER>", code, 1)

        # Replace Markdown figure references by LaTeX references.
        text_lines[i] = re.sub(r"`(fig:\S*?)`", r"\\autoref{\1}", text_lines[i])
        # Replace Markdown equation references by LaTeX references.
        text_lines[i] = re.sub(r"`(eq:\S*?)`", r"\\autoref{\1}", text_lines[i])
        # Replace Markdown algorithm references by LaTeX references.
        text_lines[i] = re.sub(r"`(alg:\S*?)`", r"\\autoref{\1}", text_lines[i])

        # ===== TEXT FORMATTING =====
        # Monospace must run after the reference rules above, otherwise the
        # `fig:`/`eq:`/`alg:` spans would already have been turned into \texttt.
        text_lines[i] = re.sub(r"`(.*?)`", r"\\texttt{\1}", text_lines[i])
        # Italics wrapping quote marks becomes a LaTeX \textquote.
        text_lines[i] = re.sub(
            r'(?<!\*)\*"([^\*].*?)"\*(?!\*)', r"\\textquote{\1}", text_lines[i]
        )
        # Single-asterisk italics (lookarounds exclude ** bold markers).
        text_lines[i] = re.sub(
            r"(?<!\*)\*([^\*].*?)\*(?!\*)", r"\\textit{\1}", text_lines[i]
        )
        # Double-asterisk bold.
        text_lines[i] = re.sub(r"\*\*([^\*].*?)\*\*", r"\\textbf{\1}", text_lines[i])
        # ==highlight== becomes \hl (requires the soul/color packages).
        text_lines[i] = re.sub(r"==([^=].*?)==", r"\\hl{\1}", text_lines[i])
        text_lines[i] = replace_adjacent_citations(text_lines[i])

    return text_lines
|
@@ -0,0 +1,259 @@
|
|
1
|
+
import logging
|
2
|
+
import re
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Optional, Sequence, Set, Tuple
|
5
|
+
|
6
|
+
import yaml
|
7
|
+
|
8
|
+
from obsitex.planner.jobs import AddBibliography, AddHeader, AddText, PlannedJob
|
9
|
+
from obsitex.planner.links import find_all_citations, find_all_links
|
10
|
+
from obsitex.utils import assure_dir, assure_file, read_file
|
11
|
+
|
12
|
+
|
13
|
+
def parse_yaml_properties(text: str) -> Tuple[str, dict]:
    """Split optional YAML front matter off the start of *text*.

    Returns ``(remaining_text, properties)``. When *text* does not start
    with a properly closed ``--- ... ---`` block, it is returned unchanged
    with empty properties. Propagates ``yaml.YAMLError`` for malformed YAML
    (callers catch and ignore it).
    """
    properties: dict = {}

    if text.startswith("---"):
        # Find the closing marker of the front matter.
        end_properties = text.find("---", 3)

        # Only treat the prefix as front matter when the closing marker
        # exists; otherwise the leading "---" is plain content. (Previously
        # a missing marker produced find() == -1 and silently mangled the
        # text via negative slicing.)
        if end_properties != -1:
            yaml_configs = text[3:end_properties]
            loaded = yaml.safe_load(yaml_configs)

            # safe_load returns None for an empty block and may return a
            # scalar for degenerate input; callers dict.update() the result,
            # so only accept a real mapping.
            if isinstance(loaded, dict):
                properties = loaded

            # Drop the front matter from the returned text.
            text = text[end_properties + 3 :].strip()

    return text, properties
|
30
|
+
|
31
|
+
|
32
|
+
class ExecutionPlan:
    """Collects rendering jobs (headers, text, bibliography) from Obsidian
    files/folders and yields them in the order they should be emitted.

    Jobs flagged with the "appendix" config are moved after the implicit
    bibliography job (when one is needed).
    """

    def __init__(
        self,
        bibtex_database_path: Optional[Path] = None,
        implictly_add_bibtex: bool = True,
    ):
        """
        Args:
            bibtex_database_path: optional path to a BibTeX database; must
                exist if provided.
            implictly_add_bibtex: when True, a bibliography job is appended
                automatically as soon as any citation key was collected.
        """
        self.bibtex_database_path = bibtex_database_path
        self.implictly_add_bibtex = implictly_add_bibtex

        # Check that if the paths are provided, they are valid.
        assure_file(self.bibtex_database_path)

        # Variables to store extracted data.
        self._citation_keys: Set[str] = set()
        self._n_files_read = 0

        # Ordered jobs that make up the execution plan (a list at runtime).
        self._jobs: Sequence[PlannedJob] = []

    @property
    def n_files_read(self) -> int:
        """Number of markdown files read so far."""
        return self._n_files_read

    @property
    def num_headers(self) -> int:
        """Number of header jobs currently planned."""
        return len([job for job in self._jobs if isinstance(job, AddHeader)])

    def iter_jobs(self):
        """Yield all jobs: body jobs, then (maybe) bibliography, then appendix.

        Assumes all files have already been added to the plan.
        Raises FileNotFoundError when citations were collected but no valid
        BibTeX database was configured.
        """
        # Find the first job flagged as appendix content.
        appendix_job_idx = None

        for idx, job in enumerate(self._jobs):
            if job.is_in_appendix:
                appendix_job_idx = idx
                break

        if appendix_job_idx is None:
            appendix_job_idx = len(self._jobs)

        # Yield the non-appendix jobs first.
        for job in self._jobs[:appendix_job_idx]:
            yield job

        # The bibliography (if any) goes between the body and the appendix.
        if self.implictly_add_bibtex and len(self._citation_keys) > 0:
            if (
                self.bibtex_database_path is None
                or not self.bibtex_database_path.is_file()
            ):
                raise FileNotFoundError(
                    f"BibTeX database not found at {self.bibtex_database_path}, please provide a valid path if you're using citations."
                )

            add_bib_job = AddBibliography(
                self._citation_keys, self.bibtex_database_path
            )
            yield add_bib_job

        # Finally the appendix jobs, explicitly marked as such.
        for job in self._jobs[appendix_job_idx:]:
            job.mark_as_appendix()
            yield job

    def add_citations(self, text: str):
        """Collect all [[@key]] citation keys found in *text*."""
        self._citation_keys.update(find_all_citations(text))

    def add_file(self, file_path: Path):
        """Add a single markdown file as one text job (links are not followed)."""
        assure_file(file_path)

        # Read the file contents.
        file_contents = read_file(file_path)
        self._n_files_read += 1

        # Extract citations from the file.
        self.add_citations(file_contents)

        # If present, parse the YAML properties; malformed YAML is ignored.
        # (Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed.)
        try:
            file_contents, properties = parse_yaml_properties(file_contents)
        except Exception:
            logging.error(
                f"Error parsing YAML properties from {file_path}, ignoring..."
            )
            properties = {}

        # Single files have no deps.
        add_text_job = AddText(file_contents)
        add_text_job.update_configs(properties)

        self._jobs.append(add_text_job)

    def add_dir(
        self,
        dir_path: Path,
        index_file: Optional[str] = None,
        max_depth: int = 10,
        base_hlevel: int = -2,
    ):
        """Walk a vault folder depth-first starting at *index_file*.

        Each visited note contributes a header job (below the index level)
        and a text job; [[links]] are followed into files or subfolders.
        Raises ValueError for missing link targets or when *max_depth* is
        exceeded (likely a link cycle).
        """
        assure_dir(dir_path)

        if index_file is None:
            index_file = "Index"

        # Depth-first search over linked notes. Base hlevel is offset by -1
        # because the index file itself doesn't produce a header.
        stack = [(dir_path, index_file, base_hlevel - 1, 0)]
        global_configs, is_index = {}, True

        while len(stack) > 0:
            current_base_path, current_file, current_hlevel, current_depth = stack.pop()

            if current_depth < max_depth:
                file_contents = read_file(current_base_path / f"{current_file}.md")
                self._n_files_read += 1

                # Each file can have properties configured in YAML.
                properties = {}

                # Find all links and remove them from the text.
                clean_text, links = find_all_links(file_contents)

                if clean_text != "":
                    # Malformed YAML is ignored (narrowed from bare except).
                    try:
                        clean_text, properties = parse_yaml_properties(clean_text)
                    except Exception:
                        logging.error(
                            f"Error parsing YAML properties from {current_file}.md, ignoring..."
                        )
                        properties = {}

                # The index's properties become global defaults; other files
                # have the globals override their local properties.
                if is_index:
                    global_configs.update(properties)

                if not is_index:
                    properties.update(global_configs)

                # If not the index file, add a header.
                if current_hlevel >= base_hlevel:
                    add_header_job = AddHeader(current_file, current_hlevel)
                    add_header_job.update_configs(properties)

                    self._jobs.append(add_header_job)

                if clean_text != "":
                    add_text_job = AddText(clean_text)
                    add_text_job.update_configs(properties)

                    self._jobs.append(add_text_job)
                    self.add_citations(clean_text)

                # Reversed so the DFS stack pops links in document order.
                for link in reversed(links):
                    # A link might point to a file in the same folder or to
                    # a subdirectory named after the link.
                    new_base_path = current_base_path / link

                    if not new_base_path.is_dir():
                        new_base_path = current_base_path

                    if not (new_base_path / f"{link}.md").is_file():
                        raise ValueError(
                            f"File {link} not found in {new_base_path}"
                        )

                    stack.append(
                        (new_base_path, link, current_hlevel + 1, current_depth + 1)
                    )
            else:
                raise ValueError(
                    f"Max depth of {max_depth} reached, please check for cycles in your links."
                )

            if is_index:
                is_index = False

        logging.info(f"Added {len(self._jobs)} jobs to the execution plan.")

    def show(self, text_limit: int = 50, show_configs: bool = False):
        """Log the planned jobs and a table of contents.

        Args:
            text_limit: max characters of each text job to log (<0 disables).
            show_configs: also log each header's configs in the TOC.
        """
        for order, job in enumerate(self._jobs, start=1):
            if isinstance(job, AddText):
                if text_limit >= 0:
                    text_content = job.text[:text_limit]
                    logging.info(f"{order}. Adding text: {text_content}...")
            elif isinstance(job, AddHeader):
                logging.info(
                    f"{order}. Adding header: {job.header} with level {job.level}..."
                )
            else:
                logging.warning(f"{order}. Unknown job type: {job}")

        logging.info("Printing table of contents...")

        # Offset so the smallest (possibly negative) header level maps to 0.
        if self.num_headers == 0:
            hlevel_zero_adjusted = 0
        else:
            hlevel_zero_adjusted = min(
                [job.level for job in self._jobs if isinstance(job, AddHeader)]
            )

        if hlevel_zero_adjusted < 0:
            hlevel_zero_adjusted = -hlevel_zero_adjusted

        # Used for text jobs to know the header level above them.
        prev_header_level = 0
        header_regex = r"(#+)\s*(.+)\s*\n"
        toc_content = []

        for job in self._jobs:
            if isinstance(job, AddHeader):
                level = job.level + hlevel_zero_adjusted + 1
                title = job.header
                prev_header_level = level
                toc_content.append((level, title, job.configs))
            elif isinstance(job, AddText):
                level = prev_header_level

                # Markdown headers inside the text also show in the TOC.
                headers_in_text = re.findall(header_regex, job.text)

                for header_level, header_title in headers_in_text:
                    level = len(header_level) + prev_header_level
                    toc_content.append((level, header_title, None))

        for level, header, configs in toc_content:
            if configs is not None and show_configs:
                logging.info(f"{' '*(level-1)}| {header} {configs}")
            else:
                logging.info(f"{' '*(level-1)}| {header}")
|
obsitex/planner/jobs.py
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
from abc import ABC
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Set
|
4
|
+
|
5
|
+
|
6
|
+
class PlannedJob(ABC):
    """Base class for one unit of work in the execution plan.

    Every job carries a free-form ``configs`` mapping (merged in from YAML
    front matter); the ``"appendix"`` key flags appendix content.
    """

    def __init__(self):
        # Per-job configuration, merged in via update_configs().
        self.configs = {}

    def update_configs(self, kwargs: dict):
        """Merge *kwargs* into this job's configuration."""
        self.configs.update(kwargs)

    @property
    def is_in_appendix(self) -> bool:
        """The job's "appendix" config value (False when unset)."""
        return self.configs.get("appendix", False)

    def mark_as_appendix(self):
        """Flag this job as appendix content."""
        self.update_configs({"appendix": True})
|
19
|
+
|
20
|
+
|
21
|
+
class AddText(PlannedJob):
    """Job carrying a chunk of markdown text to be converted to LaTeX."""

    def __init__(self, text: str):
        super().__init__()
        # Raw markdown contents for this job.
        self.text = text
|
25
|
+
|
26
|
+
|
27
|
+
class AddHeader(PlannedJob):
    """Job emitting a sectioning header: *header* is the title text and
    *level* its relative depth in the document tree."""

    def __init__(self, header: str, level: int):
        super().__init__()
        # Title text of the header.
        self.header = header
        # Relative sectioning depth.
        self.level = level
|
32
|
+
|
33
|
+
|
34
|
+
class AddBibliography(PlannedJob):
    """Job rendering the bibliography for the collected citation keys."""

    def __init__(self, citations: Set[str], bibtex_path: Path):
        super().__init__()
        # Citation keys referenced anywhere in the document.
        self.citations = citations
        # BibTeX database the keys resolve against.
        self.bibtex_path = bibtex_path
|
obsitex/planner/links.py
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
import re
|
2
|
+
from typing import Any, Sequence, Set, Tuple
|
3
|
+
|
4
|
+
|
5
|
+
def find_all_citations(text: str) -> Set[Any]:
    """Return the unique citation keys appearing as ``[[@key]]`` tags."""
    # Each tag yields its key (the part after the '@'); set() deduplicates.
    return set(re.findall(r"\[\[@([^\]]+?)\]\]", text))
|
12
|
+
|
13
|
+
|
14
|
+
def find_all_links(text: str) -> Tuple[str, Sequence[str]]:
    """Extract Obsidian note links (``[[...]]``, not ``![[...]]`` embeds).

    Returns the text with the link tags removed (and stripped), plus the
    note names: the part after ``|`` when an alias is present, the basename
    when the link is a path, the link itself otherwise. ``[[@...]]`` tags
    are citations and are left untouched.
    """
    raw_links = [
        target
        for target in re.findall(r"(?<!\!)\[\[(.*?)\]\]", text)
        if not target.startswith("@")  # citations are handled elsewhere
    ]

    note_names = []
    for raw in raw_links:
        if "|" in raw:
            note_names.append(raw.split("|")[1])
        elif "/" in raw:
            note_names.append(raw.split("/")[-1])
        else:
            note_names.append(raw)

    # Strip the original tags out of the text.
    for raw in raw_links:
        text = text.replace(f"[[{raw}]]", "")

    return text.strip(), note_names
|
obsitex/utils.py
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
|
5
|
+
def assure_dir(path: Optional[Path]):
    """Raise ValueError when *path* is given but not an existing directory.

    ``None`` is accepted silently so optional paths need no separate guard.
    """
    if path is None:
        return
    if not path.is_dir():
        raise ValueError(f"Path {path} is not a directory.")
|
8
|
+
|
9
|
+
|
10
|
+
def assure_file(path: Optional[Path]):
    """Raise ValueError when *path* is given but not an existing file.

    ``None`` is accepted silently so optional paths need no separate guard.
    """
    if path is None:
        return
    if not path.is_file():
        raise ValueError(f"Path {path} is not a file.")
|
13
|
+
|
14
|
+
|
15
|
+
def read_file(file_path: Path) -> str:
    """Return the full text contents of *file_path*.

    Raises ValueError (via assure_file) when the path is not an existing file.
    """
    assure_file(file_path)

    # Explicit UTF-8: Obsidian vaults are UTF-8, while open()'s default is
    # the platform locale encoding (e.g. cp1252 on Windows), which would
    # corrupt or reject non-ASCII notes.
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: obsitex
|
3
|
+
Version: 0.0.0
|
4
|
+
Author: Rui Reis
|
5
|
+
Author-email: ruipedronetoreis12@gmail.com
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
7
|
+
Classifier: Operating System :: OS Independent
|
8
|
+
Requires-Python: >=3.8
|
9
|
+
Requires-Dist: Jinja2 >=3.1.2
|
10
|
+
Requires-Dist: bibtexparser >=1.4.3
|
11
|
+
Requires-Dist: pandas >=2.1.3
|
12
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
obsitex/__init__.py,sha256=w-YmpYAdbQPyJqO1JLEaRApy1M_FzAH7nN4VvlQku-U,42
|
2
|
+
obsitex/cli.py,sha256=otjRQIoidXecJFQs6WCPFRUv8njgieEULQzJfMZMLYY,2797
|
3
|
+
obsitex/constants.py,sha256=dgMZjokPOYHLLOvDbsOM4XiQvOK26BAHmRdeWHn8DTE,970
|
4
|
+
obsitex/utils.py,sha256=hqLRvuaQx_Y5rVY8B8lD6rbkMagoc3MBtswwdt_34f0,481
|
5
|
+
obsitex/parser/__init__.py,sha256=FPcfq6MaVpxyrhrdfWRVNSzdE9Gz9DUOh_DPlahYsEc,7494
|
6
|
+
obsitex/parser/blocks.py,sha256=q7LJcctsM32sh5iR5Bp5aa2e1kLucRwT4IGnlEsqOyA,15525
|
7
|
+
obsitex/parser/formatting.py,sha256=kfb_7PbdbW1QwE4HDbFrxkINejTOzwZqilgY1JCEZVE,4204
|
8
|
+
obsitex/planner/__init__.py,sha256=4yf4scYeQ_jNO6nQZ0BboyILoA7su0LOe0KIrnCzn6U,9207
|
9
|
+
obsitex/planner/jobs.py,sha256=UOnGMBlQFWy2nl7Zo_LcwNUiN9eqmdmhu8KeJnKWDjI,882
|
10
|
+
obsitex/planner/links.py,sha256=SJqnJzyg2EtiZgtfZhzVBpqTVUSy87YZRObgltDv10M,1045
|
11
|
+
obsitex-0.0.0.dist-info/METADATA,sha256=OQzNIbu1fSZSSJmjKewEdawlYzYsiYKMIofC2GvbSIw,326
|
12
|
+
obsitex-0.0.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
13
|
+
obsitex-0.0.0.dist-info/entry_points.txt,sha256=0LF5Zjk-yBhL6_2o7JGiIXthwkGyhT4Snf4Waz2wRDA,45
|
14
|
+
obsitex-0.0.0.dist-info/top_level.txt,sha256=Mi24FWCjxDXVa3wNtCN0NuT_dRgyN5haGA4IE8oHeLQ,8
|
15
|
+
obsitex-0.0.0.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
obsitex
|