fabricatio 0.2.9.dev4__cp312-cp312-win_amd64.whl → 0.2.10__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricatio/actions/article.py +20 -106
- fabricatio/actions/article_rag.py +153 -22
- fabricatio/actions/fs.py +25 -0
- fabricatio/actions/output.py +17 -3
- fabricatio/actions/rag.py +40 -18
- fabricatio/actions/rules.py +14 -3
- fabricatio/capabilities/check.py +2 -1
- fabricatio/capabilities/rag.py +41 -231
- fabricatio/config.py +4 -2
- fabricatio/constants.py +20 -0
- fabricatio/decorators.py +23 -0
- fabricatio/models/adv_kwargs_types.py +35 -0
- fabricatio/models/events.py +6 -6
- fabricatio/models/extra/advanced_judge.py +2 -2
- fabricatio/models/extra/aricle_rag.py +170 -0
- fabricatio/models/extra/article_base.py +2 -186
- fabricatio/models/extra/article_essence.py +8 -7
- fabricatio/models/extra/article_main.py +39 -107
- fabricatio/models/extra/problem.py +12 -17
- fabricatio/models/extra/rag.py +98 -0
- fabricatio/models/extra/rule.py +1 -2
- fabricatio/models/generic.py +35 -12
- fabricatio/models/kwargs_types.py +8 -36
- fabricatio/models/task.py +3 -3
- fabricatio/models/usages.py +80 -6
- fabricatio/rust.cp312-win_amd64.pyd +0 -0
- fabricatio/rust.pyi +138 -6
- fabricatio/utils.py +62 -4
- fabricatio-0.2.10.data/scripts/tdown.exe +0 -0
- {fabricatio-0.2.9.dev4.dist-info → fabricatio-0.2.10.dist-info}/METADATA +1 -4
- fabricatio-0.2.10.dist-info/RECORD +64 -0
- fabricatio/models/utils.py +0 -148
- fabricatio-0.2.9.dev4.data/scripts/tdown.exe +0 -0
- fabricatio-0.2.9.dev4.dist-info/RECORD +0 -61
- {fabricatio-0.2.9.dev4.dist-info → fabricatio-0.2.10.dist-info}/WHEEL +0 -0
- {fabricatio-0.2.9.dev4.dist-info → fabricatio-0.2.10.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,170 @@
|
|
1
|
+
"""A Module containing the article rag models."""
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import ClassVar, Dict, List, Optional, Self, Unpack
|
5
|
+
|
6
|
+
from fabricatio.rust import BibManager, split_into_chunks, is_chinese
|
7
|
+
from more_itertools.recipes import flatten
|
8
|
+
from pydantic import Field
|
9
|
+
|
10
|
+
from fabricatio.fs import safe_text_read
|
11
|
+
from fabricatio.journal import logger
|
12
|
+
from fabricatio.models.extra.article_main import ArticleSubsection
|
13
|
+
from fabricatio.models.extra.rag import MilvusDataBase
|
14
|
+
from fabricatio.models.generic import AsPrompt
|
15
|
+
from fabricatio.models.kwargs_types import ChunkKwargs
|
16
|
+
from fabricatio.utils import ok, wrapp_in_block
|
17
|
+
|
18
|
+
|
19
|
+
class ArticleChunk(MilvusDataBase, AsPrompt):
|
20
|
+
"""The chunk of an article."""
|
21
|
+
|
22
|
+
etc_word: ClassVar[str] = "等"
|
23
|
+
and_word: ClassVar[str] = "与"
|
24
|
+
_cite_number: Optional[int] = None
|
25
|
+
|
26
|
+
head_split: ClassVar[List[str]] = [
|
27
|
+
"引 言",
|
28
|
+
"引言",
|
29
|
+
"绪 论",
|
30
|
+
"绪论",
|
31
|
+
"前言",
|
32
|
+
"INTRODUCTION",
|
33
|
+
"Introduction",
|
34
|
+
]
|
35
|
+
tail_split: ClassVar[List[str]] = [
|
36
|
+
"参 考 文 献",
|
37
|
+
"参 考 文 献",
|
38
|
+
"参考文献",
|
39
|
+
"REFERENCES",
|
40
|
+
"References",
|
41
|
+
"Bibliography",
|
42
|
+
"Reference",
|
43
|
+
]
|
44
|
+
chunk: str
|
45
|
+
"""The segment of the article"""
|
46
|
+
year: int
|
47
|
+
"""The year of the article"""
|
48
|
+
authors: List[str] = Field(default_factory=list)
|
49
|
+
"""The authors of the article"""
|
50
|
+
article_title: str
|
51
|
+
"""The title of the article"""
|
52
|
+
bibtex_cite_key: str
|
53
|
+
"""The bibtex cite key of the article"""
|
54
|
+
|
55
|
+
def _as_prompt_inner(self) -> Dict[str, str]:
|
56
|
+
return {
|
57
|
+
f"{ok(self._cite_number, 'You need to update cite number first.')}th reference `{self.article_title}`": f"{wrapp_in_block(self.chunk, 'Referring Content')}\n"
|
58
|
+
f"Authors: {';'.join(self.authors)}\n"
|
59
|
+
f"Published Year: {self.year}\n"
|
60
|
+
}
|
61
|
+
|
62
|
+
def _prepare_vectorization_inner(self) -> str:
|
63
|
+
return self.chunk
|
64
|
+
|
65
|
+
@classmethod
|
66
|
+
def from_file[P: str | Path](
|
67
|
+
cls, path: P | List[P], bib_mgr: BibManager, **kwargs: Unpack[ChunkKwargs]
|
68
|
+
) -> List[Self]:
|
69
|
+
"""Load the article chunks from the file."""
|
70
|
+
if isinstance(path, list):
|
71
|
+
result = list(flatten(cls._from_file_inner(p, bib_mgr, **kwargs) for p in path))
|
72
|
+
logger.debug(f"Number of chunks created from list of files: {len(result)}")
|
73
|
+
return result
|
74
|
+
|
75
|
+
return cls._from_file_inner(path, bib_mgr, **kwargs)
|
76
|
+
|
77
|
+
@classmethod
|
78
|
+
def _from_file_inner(cls, path: str | Path, bib_mgr: BibManager, **kwargs: Unpack[ChunkKwargs]) -> List[Self]:
|
79
|
+
path = Path(path)
|
80
|
+
|
81
|
+
title_seg = path.stem.split(" - ").pop()
|
82
|
+
|
83
|
+
key = (
|
84
|
+
bib_mgr.get_cite_key_by_title(title_seg)
|
85
|
+
or bib_mgr.get_cite_key_by_title_fuzzy(title_seg)
|
86
|
+
or bib_mgr.get_cite_key_fuzzy(path.stem)
|
87
|
+
)
|
88
|
+
if key is None:
|
89
|
+
logger.warning(f"no cite key found for {path.as_posix()}, skip.")
|
90
|
+
return []
|
91
|
+
authors = ok(bib_mgr.get_author_by_key(key), f"no author found for {key}")
|
92
|
+
year = ok(bib_mgr.get_year_by_key(key), f"no year found for {key}")
|
93
|
+
article_title = ok(bib_mgr.get_title_by_key(key), f"no title found for {key}")
|
94
|
+
|
95
|
+
result = [
|
96
|
+
cls(chunk=c, year=year, authors=authors, article_title=article_title, bibtex_cite_key=key)
|
97
|
+
for c in split_into_chunks(cls.strip(safe_text_read(path)), **kwargs)
|
98
|
+
]
|
99
|
+
logger.debug(f"Number of chunks created from file {path.as_posix()}: {len(result)}")
|
100
|
+
return result
|
101
|
+
|
102
|
+
@classmethod
|
103
|
+
def strip(cls, string: str) -> str:
|
104
|
+
"""Strip the head and tail of the string."""
|
105
|
+
logger.debug(f"String length before strip: {(original := len(string))}")
|
106
|
+
for split in (s for s in cls.head_split if s in string):
|
107
|
+
logger.debug(f"Strip head using {split}")
|
108
|
+
parts = string.split(split)
|
109
|
+
string = split.join(parts[1:]) if len(parts) > 1 else parts[0]
|
110
|
+
break
|
111
|
+
logger.debug(
|
112
|
+
f"String length after head strip: {(stripped_len := len(string))}, decreased by {(d := original - stripped_len)}"
|
113
|
+
)
|
114
|
+
if not d:
|
115
|
+
logger.warning("No decrease at head strip, which is might be abnormal.")
|
116
|
+
for split in (s for s in cls.tail_split if s in string):
|
117
|
+
logger.debug(f"Strip tail using {split}")
|
118
|
+
parts = string.split(split)
|
119
|
+
string = split.join(parts[:-1]) if len(parts) > 1 else parts[0]
|
120
|
+
break
|
121
|
+
logger.debug(f"String length after tail strip: {len(string)}, decreased by {(d := stripped_len - len(string))}")
|
122
|
+
if not d:
|
123
|
+
logger.warning("No decrease at tail strip, which is might be abnormal.")
|
124
|
+
|
125
|
+
return string
|
126
|
+
|
127
|
+
def as_typst_cite(self) -> str:
|
128
|
+
"""As typst cite."""
|
129
|
+
return f"#cite(<{self.bibtex_cite_key}>)"
|
130
|
+
|
131
|
+
@property
|
132
|
+
def auther_firstnames(self) -> List[str]:
|
133
|
+
"""Get the first name of the authors."""
|
134
|
+
ret = []
|
135
|
+
for n in self.authors:
|
136
|
+
if is_chinese(n):
|
137
|
+
ret.append(n[0])
|
138
|
+
else:
|
139
|
+
ret.append(n.split()[-1])
|
140
|
+
return ret
|
141
|
+
|
142
|
+
def as_auther_seq(self) -> str:
|
143
|
+
"""Get the auther sequence."""
|
144
|
+
match len(self.authors):
|
145
|
+
case 0:
|
146
|
+
raise ValueError("No authors found")
|
147
|
+
case 1:
|
148
|
+
return f"({self.auther_firstnames[0]},{self.year}){self.as_typst_cite()}"
|
149
|
+
case 2:
|
150
|
+
return f"({self.auther_firstnames[0]}{self.and_word}{self.auther_firstnames[1]},{self.year}){self.as_typst_cite()}"
|
151
|
+
case 3:
|
152
|
+
return f"({self.auther_firstnames[0]},{self.auther_firstnames[1]}{self.and_word}{self.auther_firstnames[2]},{self.year}){self.as_typst_cite()}"
|
153
|
+
case _:
|
154
|
+
return f"({self.auther_firstnames[0]},{self.auther_firstnames[1]}{self.and_word}{self.auther_firstnames[2]}{self.etc_word},{self.year}){self.as_typst_cite()}"
|
155
|
+
|
156
|
+
def update_cite_number(self, cite_number: int) -> Self:
|
157
|
+
"""Update the cite number."""
|
158
|
+
self._cite_number = cite_number
|
159
|
+
return self
|
160
|
+
|
161
|
+
def replace_cite(self, string: str, left_char: str = "[[", right_char: str = "]]") -> str:
|
162
|
+
"""Replace the cite number in the string."""
|
163
|
+
return string.replace(f"{left_char}{ok(self._cite_number)}{right_char}", self.as_auther_seq())
|
164
|
+
|
165
|
+
def apply(self, article_subsection: ArticleSubsection) -> ArticleSubsection:
|
166
|
+
"""Apply the patch to the article subsection."""
|
167
|
+
for p in article_subsection.paragraphs:
|
168
|
+
p.content = self.replace_cite(p.content)
|
169
|
+
|
170
|
+
return article_subsection
|
@@ -2,8 +2,7 @@
|
|
2
2
|
|
3
3
|
from abc import ABC, abstractmethod
|
4
4
|
from enum import StrEnum
|
5
|
-
from
|
6
|
-
from typing import Generator, List, Optional, Self, Tuple, overload
|
5
|
+
from typing import Generator, List, Optional, Self, Tuple
|
7
6
|
|
8
7
|
from fabricatio.models.generic import (
|
9
8
|
AsPrompt,
|
@@ -15,7 +14,6 @@ from fabricatio.models.generic import (
|
|
15
14
|
PersistentAble,
|
16
15
|
ProposedUpdateAble,
|
17
16
|
ResolveUpdateConflict,
|
18
|
-
SequencePatch,
|
19
17
|
SketchedAble,
|
20
18
|
Titled,
|
21
19
|
WordCount,
|
@@ -31,81 +29,9 @@ class ReferringType(StrEnum):
|
|
31
29
|
SUBSECTION = "subsection"
|
32
30
|
|
33
31
|
|
34
|
-
type RefKey = Tuple[str, Optional[str], Optional[str]]
|
35
|
-
|
36
|
-
|
37
|
-
class ArticleRef(ProposedUpdateAble):
|
38
|
-
"""Reference to a specific chapter, section or subsection within the article. You SHALL not refer to an article component that is external and not present within our own article.
|
39
|
-
|
40
|
-
Examples:
|
41
|
-
- Referring to a chapter titled `Introduction`:
|
42
|
-
Using Python
|
43
|
-
```python
|
44
|
-
ArticleRef(chap="Introduction")
|
45
|
-
```
|
46
|
-
Using JSON
|
47
|
-
```json
|
48
|
-
{chap="Introduction"}
|
49
|
-
```
|
50
|
-
- Referring to a section titled `Background` under the `Introduction` chapter:
|
51
|
-
Using Python
|
52
|
-
```python
|
53
|
-
ArticleRef(chap="Introduction", sec="Background")
|
54
|
-
```
|
55
|
-
Using JSON
|
56
|
-
```json
|
57
|
-
{chap="Introduction", sec="Background"}
|
58
|
-
```
|
59
|
-
- Referring to a subsection titled `Related Work` under the `Background` section of the `Introduction` chapter:
|
60
|
-
Using Python
|
61
|
-
```python
|
62
|
-
ArticleRef(chap="Introduction", sec="Background", subsec="Related Work")
|
63
|
-
```
|
64
|
-
Using JSON
|
65
|
-
```json
|
66
|
-
{chap="Introduction", sec="Background", subsec="Related Work"}
|
67
|
-
```
|
68
|
-
"""
|
69
|
-
|
70
|
-
chap: str
|
71
|
-
"""`title` Field of the referenced chapter"""
|
72
|
-
sec: Optional[str] = None
|
73
|
-
"""`title` Field of the referenced section."""
|
74
|
-
subsec: Optional[str] = None
|
75
|
-
"""`title` Field of the referenced subsection."""
|
76
|
-
|
77
|
-
def update_from_inner(self, other: Self) -> Self:
|
78
|
-
"""Updates the current instance with the attributes of another instance."""
|
79
|
-
self.chap = other.chap
|
80
|
-
self.sec = other.sec
|
81
|
-
self.subsec = other.subsec
|
82
|
-
return self
|
83
|
-
|
84
|
-
def deref(self, article: "ArticleBase") -> Optional["ArticleOutlineBase"]:
|
85
|
-
"""Dereference the reference to the actual section or subsection within the provided article.
|
86
|
-
|
87
|
-
Args:
|
88
|
-
article (ArticleOutline | Article): The article to dereference the reference from.
|
89
32
|
|
90
|
-
|
91
|
-
ArticleMainBase | ArticleOutline | None: The dereferenced section or subsection, or None if not found.
|
92
|
-
"""
|
93
|
-
chap = next((chap for chap in article.chapters if chap.title == self.chap), None)
|
94
|
-
if self.sec is None or chap is None:
|
95
|
-
return chap
|
96
|
-
sec = next((sec for sec in chap.sections if sec.title == self.sec), None)
|
97
|
-
if self.subsec is None or sec is None:
|
98
|
-
return sec
|
99
|
-
return next((subsec for subsec in sec.subsections if subsec.title == self.subsec), None)
|
33
|
+
type RefKey = Tuple[str, Optional[str], Optional[str]]
|
100
34
|
|
101
|
-
@property
|
102
|
-
def referring_type(self) -> ReferringType:
|
103
|
-
"""Determine the type of reference based on the presence of specific attributes."""
|
104
|
-
if self.subsec is not None:
|
105
|
-
return ReferringType.SUBSECTION
|
106
|
-
if self.sec is not None:
|
107
|
-
return ReferringType.SECTION
|
108
|
-
return ReferringType.CHAPTER
|
109
35
|
|
110
36
|
|
111
37
|
class ArticleMetaData(SketchedAble, Described, WordCount, Titled, Language):
|
@@ -121,15 +47,8 @@ class ArticleMetaData(SketchedAble, Described, WordCount, Titled, Language):
|
|
121
47
|
aims: List[str]
|
122
48
|
"""List of writing aims of the research component in academic style."""
|
123
49
|
|
124
|
-
support_to: List[ArticleRef]
|
125
|
-
"""List of references to other future components in this article that this component supports to."""
|
126
|
-
depend_on: List[ArticleRef]
|
127
|
-
"""List of references to other previous components in this article that this component depends on."""
|
128
50
|
|
129
51
|
|
130
|
-
class ArticleRefSequencePatch(SequencePatch[ArticleRef]):
|
131
|
-
"""Patch for article refs."""
|
132
|
-
|
133
52
|
|
134
53
|
class ArticleOutlineBase(
|
135
54
|
ArticleMetaData,
|
@@ -148,10 +67,6 @@ class ArticleOutlineBase(
|
|
148
67
|
|
149
68
|
def update_metadata(self, other: ArticleMetaData) -> Self:
|
150
69
|
"""Updates the metadata of the current instance with the attributes of another instance."""
|
151
|
-
self.support_to.clear()
|
152
|
-
self.support_to.extend(other.support_to)
|
153
|
-
self.depend_on.clear()
|
154
|
-
self.depend_on.extend(other.depend_on)
|
155
70
|
self.aims.clear()
|
156
71
|
self.aims.extend(other.aims)
|
157
72
|
self.description = other.description
|
@@ -319,34 +234,6 @@ class ArticleBase[T: ChapterBase](FinalizedDumpAble, AsPrompt, WordCount, Descri
|
|
319
234
|
yield sec
|
320
235
|
yield from sec.subsections
|
321
236
|
|
322
|
-
def iter_support_on(self, rev: bool = False) -> Generator[ArticleRef, None, None]:
|
323
|
-
"""Iterates over all references that the article components support.
|
324
|
-
|
325
|
-
Args:
|
326
|
-
rev (bool): If True, iterate in reverse order.
|
327
|
-
|
328
|
-
Yields:
|
329
|
-
ArticleRef: Each reference that the article components support.
|
330
|
-
"""
|
331
|
-
if rev:
|
332
|
-
yield from chain(*[a.support_to for a in self.iter_dfs_rev()])
|
333
|
-
return
|
334
|
-
yield from chain(*[a.support_to for a in self.iter_dfs()])
|
335
|
-
|
336
|
-
def iter_depend_on(self, rev: bool = False) -> Generator[ArticleRef, None, None]:
|
337
|
-
"""Iterates over all references that the article components depend on.
|
338
|
-
|
339
|
-
Args:
|
340
|
-
rev (bool): If True, iterate in reverse order.
|
341
|
-
|
342
|
-
Yields:
|
343
|
-
ArticleRef: Each reference that the article components depend on.
|
344
|
-
"""
|
345
|
-
if rev:
|
346
|
-
yield from chain(*[a.depend_on for a in self.iter_dfs_rev()])
|
347
|
-
return
|
348
|
-
yield from chain(*[a.depend_on for a in self.iter_dfs()])
|
349
|
-
|
350
237
|
def iter_sections(self) -> Generator[Tuple[ChapterBase, SectionBase], None, None]:
|
351
238
|
"""Iterates through all sections in the article.
|
352
239
|
|
@@ -380,12 +267,6 @@ class ArticleBase[T: ChapterBase](FinalizedDumpAble, AsPrompt, WordCount, Descri
|
|
380
267
|
"""Gathers all introspected components in the article structure."""
|
381
268
|
return "\n".join([i for component in self.chapters if (i := component.introspect())])
|
382
269
|
|
383
|
-
@overload
|
384
|
-
def find_illegal_ref(self, gather_identical: bool) -> Optional[Tuple[ArticleRef | List[ArticleRef], str]]: ...
|
385
|
-
|
386
|
-
@overload
|
387
|
-
def find_illegal_ref(self) -> Optional[Tuple[ArticleRef, str]]: ...
|
388
|
-
|
389
270
|
def iter_chap_title(self) -> Generator[str, None, None]:
|
390
271
|
"""Iterates through all chapter titles in the article."""
|
391
272
|
for chap in self.chapters:
|
@@ -401,71 +282,6 @@ class ArticleBase[T: ChapterBase](FinalizedDumpAble, AsPrompt, WordCount, Descri
|
|
401
282
|
for _, _, subsec in self.iter_subsections():
|
402
283
|
yield subsec.title
|
403
284
|
|
404
|
-
def find_illegal_ref(self, gather_identical: bool = False) -> Optional[Tuple[ArticleRef | List[ArticleRef], str]]:
|
405
|
-
"""Finds the first illegal component in the outline.
|
406
|
-
|
407
|
-
Returns:
|
408
|
-
Tuple[ArticleOutlineBase, str]: A tuple containing the illegal component and an error message.
|
409
|
-
"""
|
410
|
-
summary = ""
|
411
|
-
chap_titles_set = set(self.iter_chap_title())
|
412
|
-
sec_titles_set = set(self.iter_section_title())
|
413
|
-
subsec_titles_set = set(self.iter_subsection_title())
|
414
|
-
|
415
|
-
for component in self.iter_dfs_rev():
|
416
|
-
for ref in chain(component.depend_on, component.support_to):
|
417
|
-
if not ref.deref(self):
|
418
|
-
summary += f"Invalid internal reference in `{component.__class__.__name__}` titled `{component.title}`, because the referred {ref.referring_type} is not exists within the article, see the original obj dump: {ref.model_dump()}\n"
|
419
|
-
|
420
|
-
if ref.chap not in (chap_titles_set):
|
421
|
-
summary += f"Chapter titled `{ref.chap}` is not any of {chap_titles_set}\n"
|
422
|
-
if ref.sec and ref.sec not in (sec_titles_set):
|
423
|
-
summary += f"Section Titled `{ref.sec}` is not any of {sec_titles_set}\n"
|
424
|
-
if ref.subsec and ref.subsec not in (subsec_titles_set):
|
425
|
-
summary += f"Subsection Titled `{ref.subsec}` is not any of {subsec_titles_set}"
|
426
|
-
|
427
|
-
if summary:
|
428
|
-
return (
|
429
|
-
(
|
430
|
-
[
|
431
|
-
identical_ref
|
432
|
-
for identical_ref in chain(self.iter_depend_on(), self.iter_support_on())
|
433
|
-
if identical_ref == ref
|
434
|
-
],
|
435
|
-
summary,
|
436
|
-
)
|
437
|
-
if gather_identical
|
438
|
-
else (ref, summary)
|
439
|
-
)
|
440
|
-
|
441
|
-
return None
|
442
|
-
|
443
|
-
def gather_illegal_ref(self) -> Tuple[List[ArticleRef], str]:
|
444
|
-
"""Gathers all illegal references in the article."""
|
445
|
-
summary = []
|
446
|
-
chap_titles_set = set(self.iter_chap_title())
|
447
|
-
sec_titles_set = set(self.iter_section_title())
|
448
|
-
subsec_titles_set = set(self.iter_subsection_title())
|
449
|
-
res_seq = []
|
450
|
-
|
451
|
-
for component in self.iter_dfs():
|
452
|
-
for ref in (
|
453
|
-
r for r in chain(component.depend_on, component.support_to) if not r.deref(self) and r not in res_seq
|
454
|
-
):
|
455
|
-
res_seq.append(ref)
|
456
|
-
if ref.chap not in chap_titles_set:
|
457
|
-
summary.append(
|
458
|
-
f"Chapter titled `{ref.chap}` is not exist, since it is not any of {chap_titles_set}."
|
459
|
-
)
|
460
|
-
if ref.sec and (ref.sec not in sec_titles_set):
|
461
|
-
summary.append(f"Section Titled `{ref.sec}` is not exist, since it is not any of {sec_titles_set}")
|
462
|
-
if ref.subsec and (ref.subsec not in subsec_titles_set):
|
463
|
-
summary.append(
|
464
|
-
f"Subsection Titled `{ref.subsec}` is not exist, since it is not any of {subsec_titles_set}"
|
465
|
-
)
|
466
|
-
|
467
|
-
return res_seq, "\n".join(summary)
|
468
|
-
|
469
285
|
def finalized_dump(self) -> str:
|
470
286
|
"""Generates standardized hierarchical markup for academic publishing systems.
|
471
287
|
|
@@ -1,8 +1,9 @@
|
|
1
1
|
"""ArticleEssence: Semantic fingerprint of academic paper for structured analysis."""
|
2
2
|
|
3
|
-
from typing import List
|
3
|
+
from typing import List
|
4
4
|
|
5
|
-
from fabricatio.models.
|
5
|
+
from fabricatio.models.extra.rag import MilvusDataBase
|
6
|
+
from fabricatio.models.generic import PersistentAble, SketchedAble
|
6
7
|
from pydantic import BaseModel
|
7
8
|
|
8
9
|
|
@@ -54,7 +55,7 @@ class Highlightings(BaseModel):
|
|
54
55
|
"""
|
55
56
|
|
56
57
|
|
57
|
-
class ArticleEssence(
|
58
|
+
class ArticleEssence(SketchedAble, PersistentAble, MilvusDataBase):
|
58
59
|
"""Structured representation of a scientific article's core elements in its original language."""
|
59
60
|
|
60
61
|
language: str
|
@@ -93,7 +94,7 @@ class ArticleEssence(ProposedAble, Display, PersistentAble, Vectorizable):
|
|
93
94
|
bibtex_cite_key: str
|
94
95
|
"""Bibtex cite key of the original article."""
|
95
96
|
|
96
|
-
def
|
97
|
-
|
98
|
-
|
99
|
-
|
97
|
+
def _prepare_vectorization_inner(self) -> str:
|
98
|
+
return self.compact()
|
99
|
+
|
100
|
+
|
@@ -1,13 +1,14 @@
|
|
1
1
|
"""ArticleBase and ArticleSubsection classes for managing hierarchical document components."""
|
2
2
|
|
3
|
-
from itertools import chain
|
4
3
|
from typing import Dict, Generator, List, Self, Tuple, override
|
5
4
|
|
5
|
+
from fabricatio.rust import word_count, convert_all_block_tex, convert_all_inline_tex
|
6
|
+
from pydantic import Field
|
7
|
+
|
6
8
|
from fabricatio.fs.readers import extract_sections
|
7
9
|
from fabricatio.journal import logger
|
8
10
|
from fabricatio.models.extra.article_base import (
|
9
11
|
ArticleBase,
|
10
|
-
ArticleOutlineBase,
|
11
12
|
ChapterBase,
|
12
13
|
SectionBase,
|
13
14
|
SubSectionBase,
|
@@ -16,9 +17,6 @@ from fabricatio.models.extra.article_outline import (
|
|
16
17
|
ArticleOutline,
|
17
18
|
)
|
18
19
|
from fabricatio.models.generic import Described, PersistentAble, SequencePatch, SketchedAble, WithRef, WordCount
|
19
|
-
from fabricatio.rust import detect_language, word_count
|
20
|
-
from fabricatio.utils import ok
|
21
|
-
from pydantic import Field
|
22
20
|
|
23
21
|
PARAGRAPH_SEP = "// - - -"
|
24
22
|
|
@@ -66,10 +64,11 @@ class ArticleSubsection(SubSectionBase):
|
|
66
64
|
summary = ""
|
67
65
|
if len(self.paragraphs) == 0:
|
68
66
|
summary += f"`{self.__class__.__name__}` titled `{self.title}` have no paragraphs, You should add some!\n"
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
67
|
+
if (
|
68
|
+
abs((wc := self.word_count) - self.expected_word_count) / self.expected_word_count
|
69
|
+
> self._max_word_count_deviation
|
70
|
+
):
|
71
|
+
summary += f"`{self.__class__.__name__}` titled `{self.title}` have {wc} words, expected {self.expected_word_count} words!"
|
73
72
|
|
74
73
|
return summary
|
75
74
|
|
@@ -90,17 +89,14 @@ class ArticleSubsection(SubSectionBase):
|
|
90
89
|
return f"=== {self.title}\n" + f"\n{PARAGRAPH_SEP}\n".join(p.content for p in self.paragraphs)
|
91
90
|
|
92
91
|
@classmethod
|
93
|
-
def from_typst_code(cls, title: str, body: str
|
92
|
+
def from_typst_code(cls, title: str, body: str) -> Self:
|
94
93
|
"""Creates an Article object from the given Typst code."""
|
95
94
|
return cls(
|
96
95
|
heading=title,
|
97
96
|
elaboration="",
|
98
97
|
paragraphs=[Paragraph.from_content(p) for p in body.split(PARAGRAPH_SEP)],
|
99
98
|
expected_word_count=word_count(body),
|
100
|
-
language=language,
|
101
99
|
aims=[],
|
102
|
-
support_to=[],
|
103
|
-
depend_on=[],
|
104
100
|
)
|
105
101
|
|
106
102
|
|
@@ -108,20 +104,16 @@ class ArticleSection(SectionBase[ArticleSubsection]):
|
|
108
104
|
"""Atomic argumentative unit with high-level specificity."""
|
109
105
|
|
110
106
|
@classmethod
|
111
|
-
def from_typst_code(cls, title: str, body: str
|
107
|
+
def from_typst_code(cls, title: str, body: str) -> Self:
|
112
108
|
"""Creates an Article object from the given Typst code."""
|
113
109
|
return cls(
|
114
110
|
subsections=[
|
115
|
-
ArticleSubsection.from_typst_code(*pack,
|
116
|
-
for pack in extract_sections(body, level=3, section_char="=")
|
111
|
+
ArticleSubsection.from_typst_code(*pack) for pack in extract_sections(body, level=3, section_char="=")
|
117
112
|
],
|
118
113
|
heading=title,
|
119
114
|
elaboration="",
|
120
115
|
expected_word_count=word_count(body),
|
121
|
-
language=language,
|
122
116
|
aims=[],
|
123
|
-
support_to=[],
|
124
|
-
depend_on=[],
|
125
117
|
)
|
126
118
|
|
127
119
|
|
@@ -129,20 +121,16 @@ class ArticleChapter(ChapterBase[ArticleSection]):
|
|
129
121
|
"""Thematic progression implementing research function."""
|
130
122
|
|
131
123
|
@classmethod
|
132
|
-
def from_typst_code(cls, title: str, body: str
|
124
|
+
def from_typst_code(cls, title: str, body: str) -> Self:
|
133
125
|
"""Creates an Article object from the given Typst code."""
|
134
126
|
return cls(
|
135
127
|
sections=[
|
136
|
-
ArticleSection.from_typst_code(*pack,
|
137
|
-
for pack in extract_sections(body, level=2, section_char="=")
|
128
|
+
ArticleSection.from_typst_code(*pack) for pack in extract_sections(body, level=2, section_char="=")
|
138
129
|
],
|
139
130
|
heading=title,
|
140
131
|
elaboration="",
|
141
132
|
expected_word_count=word_count(body),
|
142
|
-
language=language,
|
143
133
|
aims=[],
|
144
|
-
support_to=[],
|
145
|
-
depend_on=[],
|
146
134
|
)
|
147
135
|
|
148
136
|
|
@@ -166,6 +154,22 @@ class Article(
|
|
166
154
|
"Original Article": self.display(),
|
167
155
|
}
|
168
156
|
|
157
|
+
def convert_tex(self) -> Self:
    """Run the inline and then the block TeX converters over every paragraph.

    Each paragraph's content is first replaced by the result of
    `convert_all_inline_tex` and then by the result of
    `convert_all_block_tex` applied to that new content.

    Returns:
        This article, for chaining.
    """
    for _chapter, _section, subsection in self.iter_subsections():
        for paragraph in subsection.paragraphs:
            # Inline conversion is stored first; block conversion reads it back.
            paragraph.content = convert_all_inline_tex(paragraph.content)
            paragraph.content = convert_all_block_tex(paragraph.content)
    return self
|
164
|
+
|
165
|
+
def fix_wrapper(self) -> Self:
|
166
|
+
"""Fix wrapper"""
|
167
|
+
for _, _, subsec in self.iter_subsections():
|
168
|
+
for p in subsec.paragraphs:
|
169
|
+
p.content = p.content.replace(r" \( ", "$").replace(r" \) ", "$").replace("\\[\n", "$$\n").replace(
|
170
|
+
"\n\\]", "\n$$")
|
171
|
+
return self
|
172
|
+
|
169
173
|
@override
|
170
174
|
def iter_subsections(self) -> Generator[Tuple[ArticleChapter, ArticleSection, ArticleSubsection], None, None]:
|
171
175
|
return super().iter_subsections() # pyright: ignore [reportReturnType]
|
@@ -210,92 +214,20 @@ class Article(
|
|
210
214
|
def from_typst_code(cls, title: str, body: str) -> Self:
|
211
215
|
"""Generates an article from the given Typst code."""
|
212
216
|
return cls(
|
213
|
-
language=(lang := detect_language(body)),
|
214
217
|
chapters=[
|
215
|
-
ArticleChapter.from_typst_code(*pack,
|
216
|
-
for pack in extract_sections(body, level=1, section_char="=")
|
218
|
+
ArticleChapter.from_typst_code(*pack) for pack in extract_sections(body, level=1, section_char="=")
|
217
219
|
],
|
218
220
|
heading=title,
|
219
221
|
expected_word_count=word_count(body),
|
220
222
|
abstract="",
|
221
223
|
)
|
222
224
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
if article in {ok(b.deref(self)) for b in a.support_to}:
|
233
|
-
supports.append(a)
|
234
|
-
|
235
|
-
return list(set(depends + supports))
|
236
|
-
|
237
|
-
def gather_dependencies_recursive(self, article: ArticleOutlineBase) -> List[ArticleOutlineBase]:
|
238
|
-
"""Gathers all dependencies recursively for the given article.
|
239
|
-
|
240
|
-
Args:
|
241
|
-
article (ArticleOutlineBase): The article to gather dependencies for.
|
242
|
-
|
243
|
-
Returns:
|
244
|
-
List[ArticleBase]: A list of all dependencies for the given article.
|
245
|
-
"""
|
246
|
-
q = self.gather_dependencies(article)
|
247
|
-
|
248
|
-
deps = []
|
249
|
-
while q:
|
250
|
-
a = q.pop()
|
251
|
-
deps.extend(self.gather_dependencies(a))
|
252
|
-
|
253
|
-
deps = list(
|
254
|
-
chain(
|
255
|
-
filter(lambda x: isinstance(x, ArticleChapter), deps),
|
256
|
-
filter(lambda x: isinstance(x, ArticleSection), deps),
|
257
|
-
filter(lambda x: isinstance(x, ArticleSubsection), deps),
|
258
|
-
)
|
259
|
-
)
|
260
|
-
|
261
|
-
# Initialize result containers
|
262
|
-
formatted_code = ""
|
263
|
-
processed_components = []
|
264
|
-
|
265
|
-
# Process all dependencies
|
266
|
-
while deps:
|
267
|
-
component = deps.pop()
|
268
|
-
# Skip duplicates
|
269
|
-
if (component_code := component.to_typst_code()) in formatted_code:
|
270
|
-
continue
|
271
|
-
|
272
|
-
# Add this component
|
273
|
-
formatted_code += component_code
|
274
|
-
processed_components.append(component)
|
275
|
-
|
276
|
-
return processed_components
|
277
|
-
|
278
|
-
def iter_dfs_with_deps(
|
279
|
-
self, chapter: bool = True, section: bool = True, subsection: bool = True
|
280
|
-
) -> Generator[Tuple[ArticleOutlineBase, List[ArticleOutlineBase]], None, None]:
|
281
|
-
"""Iterates through the article in a depth-first manner, yielding each component and its dependencies.
|
282
|
-
|
283
|
-
Args:
|
284
|
-
chapter (bool, optional): Whether to include chapter components. Defaults to True.
|
285
|
-
section (bool, optional): Whether to include section components. Defaults to True.
|
286
|
-
subsection (bool, optional): Whether to include subsection components. Defaults to True.
|
287
|
-
|
288
|
-
Yields:
|
289
|
-
Tuple[ArticleBase, List[ArticleBase]]: Each component and its dependencies.
|
290
|
-
"""
|
291
|
-
if all((not chapter, not section, not subsection)):
|
292
|
-
raise ValueError("At least one of chapter, section, or subsection must be True.")
|
293
|
-
|
294
|
-
for component in self.iter_dfs_rev():
|
295
|
-
if not chapter and isinstance(component, ArticleChapter):
|
296
|
-
continue
|
297
|
-
if not section and isinstance(component, ArticleSection):
|
298
|
-
continue
|
299
|
-
if not subsection and isinstance(component, ArticleSubsection):
|
300
|
-
continue
|
301
|
-
yield component, (self.gather_dependencies_recursive(component))
|
225
|
+
@classmethod
def from_mixed_source(cls, article_outline: ArticleOutline, typst_code: str) -> Self:
    """Build an article from Typst code and copy metadata over from an outline.

    The article is parsed from `typst_code` under the outline's title, takes
    the outline's expected word count and description, and then each of its
    components is updated from the outline component at the same position.

    Raises:
        ValueError: From the strict `zip` when the article and the outline
            yield a different number of components.

    Returns:
        The result of `update_ref(article_outline)` on the new article.
    """
    article = cls.from_typst_code(article_outline.title, typst_code)
    article.expected_word_count = article_outline.expected_word_count
    article.description = article_outline.description

    # Both traversals are walked in lockstep; strict=True rejects a length mismatch.
    component_pairs = zip(article.iter_dfs(), article_outline.iter_dfs(), strict=True)
    for component, outlined in component_pairs:
        component.update_metadata(outlined)
    return article.update_ref(article_outline)
|