cloudnoteslib 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloudnoteslib/__init__.py +128 -0
- cloudnoteslib/analyzers/__init__.py +14 -0
- cloudnoteslib/analyzers/content_analyzer.py +180 -0
- cloudnoteslib/analyzers/search.py +143 -0
- cloudnoteslib/analyzers/statistics.py +88 -0
- cloudnoteslib/config.py +28 -0
- cloudnoteslib/exceptions.py +19 -0
- cloudnoteslib/exporters/__init__.py +11 -0
- cloudnoteslib/exporters/base.py +31 -0
- cloudnoteslib/exporters/json_exporter.py +19 -0
- cloudnoteslib/exporters/markdown_exporter.py +28 -0
- cloudnoteslib/models/__init__.py +18 -0
- cloudnoteslib/models/note.py +323 -0
- cloudnoteslib/models/note_collection.py +233 -0
- cloudnoteslib/models/tag.py +129 -0
- cloudnoteslib/processors/__init__.py +36 -0
- cloudnoteslib/processors/base.py +157 -0
- cloudnoteslib/processors/markdown_processor.py +157 -0
- cloudnoteslib/processors/plaintext_processor.py +103 -0
- cloudnoteslib/processors/richtext_processor.py +122 -0
- cloudnoteslib/security/__init__.py +12 -0
- cloudnoteslib/security/encryptor.py +81 -0
- cloudnoteslib/security/sanitizer.py +56 -0
- cloudnoteslib-0.1.0.dist-info/METADATA +37 -0
- cloudnoteslib-0.1.0.dist-info/RECORD +27 -0
- cloudnoteslib-0.1.0.dist-info/WHEEL +5 -0
- cloudnoteslib-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cloudnoteslib.models.tag — Tag Data Model.
|
|
3
|
+
|
|
4
|
+
A lightweight model representing a categorization tag. Tags are normalized
|
|
5
|
+
to lowercase and stripped of whitespace to ensure consistent matching.
|
|
6
|
+
|
|
7
|
+
Demonstrates ENCAPSULATION through private attributes with controlled access.
|
|
8
|
+
|
|
9
|
+
Example:
|
|
10
|
+
>>> tag = Tag("Work", color="#6366f1")
|
|
11
|
+
>>> tag.name
|
|
12
|
+
'work'
|
|
13
|
+
>>> tag.to_dict()
|
|
14
|
+
{'tag_id': None, 'name': 'work', 'color': '#6366f1'}
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Tag:
    """
    Represents a categorization tag for notes.

    Tags are normalized (stripped, lowercased) on creation to ensure
    consistent matching across the system. Each tag carries a hex color
    code for UI rendering; when none is supplied, one is picked from
    ``DEFAULT_COLORS`` using a stable checksum of the normalized name.

    Attributes:
        name (str): Normalized tag name (lowercase). Read-only.
        color (str): Hex color code for UI display.
        tag_id (int, optional): Database identifier. Read-only.
    """

    # Default color palette for auto-assignment
    DEFAULT_COLORS = [
        "#6366f1",  # Indigo
        "#8b5cf6",  # Violet
        "#ec4899",  # Pink
        "#f43f5e",  # Rose
        "#f97316",  # Orange
        "#eab308",  # Yellow
        "#22c55e",  # Green
        "#06b6d4",  # Cyan
        "#3b82f6",  # Blue
        "#a855f7",  # Purple
    ]

    # Characters accepted after the leading "#" of a hex color.
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def __init__(
        self,
        name: str,
        color: Optional[str] = None,
        tag_id: Optional[int] = None,
    ):
        """
        Initialize a Tag.

        Args:
            name: Tag name (will be stripped and lowercased).
            color: Optional hex color code. Auto-assigned if None or empty.
            tag_id: Optional database identifier.

        Raises:
            ValueError: If name is empty after stripping.
        """
        normalized = name.strip().lower()
        if not normalized:
            raise ValueError("Tag name cannot be empty.")

        self._tag_id = tag_id
        self._name = normalized
        # A falsy color (None or "") falls back to the palette.
        self._color = color or self._auto_color()

    def _auto_color(self) -> str:
        """Pick a palette color from a stable checksum of the tag name."""
        # Local import keeps this class self-contained.
        import zlib

        # The builtin hash() of a str is salted per interpreter process, so
        # it would hand the same tag a different color on every run. CRC-32
        # of the UTF-8 bytes is stable across processes and platforms.
        checksum = zlib.crc32(self._name.encode("utf-8"))
        return self.DEFAULT_COLORS[checksum % len(self.DEFAULT_COLORS)]

    @staticmethod
    def _is_hex_color(value) -> bool:
        """Return True if value looks like ``#rgb`` or ``#rrggbb``."""
        return (
            isinstance(value, str)
            and value.startswith("#")
            and len(value) in (4, 7)
            and all(ch in Tag._HEX_DIGITS for ch in value[1:])
        )

    @property
    def tag_id(self) -> Optional[int]:
        """Database identifier. Read-only."""
        return self._tag_id

    @property
    def name(self) -> str:
        """Normalized tag name (always lowercase)."""
        return self._name

    @property
    def color(self) -> str:
        """Hex color code for UI rendering."""
        return self._color

    @color.setter
    def color(self, value: str):
        """
        Set the tag color.

        Args:
            value: Color in ``#rgb`` or ``#rrggbb`` form.

        Raises:
            ValueError: If value is not a string of that form.
        """
        if not self._is_hex_color(value):
            raise ValueError(f"Invalid hex color: {value}")
        self._color = value

    def to_dict(self) -> dict:
        """Serialize tag to a dictionary with tag_id, name and color keys."""
        return {
            "tag_id": self._tag_id,
            "name": self._name,
            "color": self._color,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Tag":
        """
        Create a Tag from a dictionary.

        Args:
            data: Mapping with a "name" key and optional "color" and
                "tag_id" (or "id") keys.

        Returns:
            A new Tag instance.

        Raises:
            ValueError: If the name is missing or empty.
        """
        # Compare against None explicitly: an identifier of 0 is falsy and
        # must not be replaced by the "id" fallback.
        tag_id = data.get("tag_id")
        if tag_id is None:
            tag_id = data.get("id")
        return cls(
            name=data.get("name") or "",
            color=data.get("color"),
            tag_id=tag_id,
        )

    def __repr__(self) -> str:
        return f"Tag(name='{self._name}', color='{self._color}')"

    def __eq__(self, other) -> bool:
        """
        Compare by normalized name.

        Another Tag is equal when the names match; a plain string is equal
        when it matches after stripping and lowercasing. Note that a string
        such as " Work " compares equal but hashes differently from the tag,
        so only already-normalized strings behave consistently as dict or
        set keys alongside Tag instances.
        """
        if isinstance(other, Tag):
            return self._name == other._name
        if isinstance(other, str):
            return self._name == other.strip().lower()
        # Let Python try the reflected comparison before settling on False.
        return NotImplemented

    def __hash__(self) -> int:
        return hash(self._name)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cloudnoteslib.processors — Note Content Processors.
|
|
3
|
+
|
|
4
|
+
This package demonstrates three core OOP principles:
|
|
5
|
+
|
|
6
|
+
ABSTRACTION:
|
|
7
|
+
NoteProcessor (ABC) defines the contract that ALL processors
|
|
8
|
+
must follow. It cannot be instantiated directly.
|
|
9
|
+
|
|
10
|
+
INHERITANCE:
|
|
11
|
+
MarkdownProcessor, PlainTextProcessor, and RichTextProcessor
|
|
12
|
+
all extend NoteProcessor with their own implementations.
|
|
13
|
+
|
|
14
|
+
POLYMORPHISM:
|
|
15
|
+
All processors share the same interface (process, extract_summary,
|
|
16
|
+
get_format_type) but behave differently. The calling code doesn't
|
|
17
|
+
need to know which concrete processor is being used.
|
|
18
|
+
|
|
19
|
+
Exports:
|
|
20
|
+
NoteProcessor: Abstract base class.
|
|
21
|
+
MarkdownProcessor: Handles Markdown-formatted notes.
|
|
22
|
+
PlainTextProcessor: Handles plain text notes.
|
|
23
|
+
RichTextProcessor: Handles rich/HTML text notes.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from .base import NoteProcessor
|
|
27
|
+
from .markdown_processor import MarkdownProcessor
|
|
28
|
+
from .plaintext_processor import PlainTextProcessor
|
|
29
|
+
from .richtext_processor import RichTextProcessor
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"NoteProcessor",
|
|
33
|
+
"MarkdownProcessor",
|
|
34
|
+
"PlainTextProcessor",
|
|
35
|
+
"RichTextProcessor",
|
|
36
|
+
]
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cloudnoteslib.processors.base — Abstract Base Class for Note Processors.
|
|
3
|
+
|
|
4
|
+
Demonstrates OOP ABSTRACTION:
|
|
5
|
+
This class defines the CONTRACT that all note processors must follow.
|
|
6
|
+
It cannot be instantiated directly — only its subclasses can be used.
|
|
7
|
+
This ensures consistent behavior across all content format types.
|
|
8
|
+
|
|
9
|
+
By using ABC (Abstract Base Class), we guarantee that every processor
|
|
10
|
+
implements process(), extract_summary(), and get_format_type().
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
>>> processor = NoteProcessor() # Raises TypeError — cannot instantiate ABC
|
|
14
|
+
>>> processor = MarkdownProcessor() # OK — concrete implementation
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from typing import List
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class NoteProcessor(ABC):
    """
    Abstract Base Class for note content processors.

    OOP Principle — ABSTRACTION:
        Defines the interface (contract) that all processors must implement.
        Uses Python's abc.ABC and @abstractmethod to enforce this at the
        language level. Any subclass that doesn't implement ALL abstract
        methods will raise TypeError on instantiation.

    Subclasses:
        - MarkdownProcessor: Processes Markdown-formatted content
        - PlainTextProcessor: Processes plain text content
        - RichTextProcessor: Processes HTML/rich text content

    Design Pattern — TEMPLATE METHOD:
        The clean() method provides a common algorithm skeleton that
        calls the abstract process() step. Subclasses override that step
        while keeping the overall structure intact.
    """

    @abstractmethod
    def process(self, content: str) -> str:
        """
        Process and clean the note content according to format rules.

        This is the primary transformation method. Each processor
        implements format-specific cleaning, normalization, and
        validation logic.

        Args:
            content: Raw note content string.

        Returns:
            Processed and cleaned content string.
        """

    @abstractmethod
    def extract_summary(self, content: str, max_length: int = 150) -> str:
        """
        Extract a human-readable summary from the content.

        Different formats require different summarization strategies:
        - Markdown: strip headers/formatting before extracting
        - Plain text: take first N characters
        - Rich text: strip HTML tags before extracting

        Args:
            content: Full note content.
            max_length: Maximum characters for the summary.

        Returns:
            A clean, readable summary string.
        """

    @abstractmethod
    def get_format_type(self) -> str:
        """
        Return the format identifier string for this processor.

        Returns:
            Format type string (e.g., 'markdown', 'plaintext', 'richtext').
        """

    @abstractmethod
    def extract_headings(self, content: str) -> List[str]:
        """
        Extract structural headings/sections from the content.

        Args:
            content: Full note content.

        Returns:
            List of heading strings found in the content.
        """

    # ─── Template Method (shared algorithm skeleton) ───

    def clean(self, content: str) -> str:
        """
        Template Method: standardized cleaning pipeline.

        Applies the steps every format shares, then hands over to the
        subclass:
            1. Strip leading/trailing whitespace
            2. Normalize line endings to "\\n"
            3. Apply format-specific processing via process()

        Args:
            content: Raw note content. None is treated as empty content.

        Returns:
            Fully cleaned and processed content.
        """
        # Step 1: common normalization. Falsy input (None, "") is coerced
        # to "" so this entry point does not fail before process() runs.
        normalized = (content or "").strip()
        # "\r\n" must be replaced before the lone "\r" pass, otherwise a
        # Windows line ending would turn into two newlines.
        normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")

        # Step 2: format-specific processing (delegated to subclass)
        return self.process(normalized)

    def get_word_count(self, content: str) -> int:
        """
        Count words in content after stripping format-specific markup.

        This non-abstract method provides a default implementation
        that subclasses can optionally override for format-specific
        word counting.

        Args:
            content: Note content string.

        Returns:
            Number of whitespace-separated words in the processed content.
        """
        # str.split() with no arguments yields [] for empty or
        # whitespace-only text, so no separate emptiness check is needed.
        return len(self.process(content).split())

    @property
    def processor_name(self) -> str:
        """Human-readable name of this processor."""
        return f"{self.get_format_type().capitalize()} Processor"

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(format='{self.get_format_type()}')"
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cloudnoteslib.processors.markdown_processor — Markdown Content Processor.
|
|
3
|
+
|
|
4
|
+
Demonstrates OOP INHERITANCE and POLYMORPHISM:
|
|
5
|
+
- INHERITANCE: Extends NoteProcessor (ABC) and implements all abstract methods
|
|
6
|
+
- POLYMORPHISM: process(), extract_summary(), get_format_type() behave
|
|
7
|
+
differently from PlainTextProcessor and RichTextProcessor, but share
|
|
8
|
+
the same interface — calling code doesn't need to know which processor
|
|
9
|
+
is being used.
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
>>> processor = MarkdownProcessor()
|
|
13
|
+
>>> processor.get_format_type()
|
|
14
|
+
'markdown'
|
|
15
|
+
>>> processor.process("# Hello\\n**Bold** text")
|
|
16
|
+
'Hello\\nBold text'
|
|
17
|
+
>>> processor.extract_headings("# Title\\n## Section\\nContent")
|
|
18
|
+
['Title', 'Section']
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import re
|
|
22
|
+
from typing import List
|
|
23
|
+
from .base import NoteProcessor
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class MarkdownProcessor(NoteProcessor):
    """
    Processes Markdown-formatted note content.

    Inherits from NoteProcessor and provides Markdown-specific
    implementations for content processing, summary extraction,
    and heading detection.

    OOP Principles:
        - INHERITANCE: Extends NoteProcessor abstract base class
        - POLYMORPHISM: Same interface as PlainTextProcessor/RichTextProcessor
          but with Markdown-specific behavior
    """

    def process(self, content: str) -> str:
        """
        Strip Markdown formatting to produce clean plain text.

        Removes fenced code blocks entirely and unwraps inline code,
        images (alt text kept), links (link text kept), headers,
        blockquotes, list markers, bold, italic and strikethrough while
        preserving the actual text content. Line breaks are kept; runs of
        three or more newlines are collapsed to one blank line.

        Args:
            content: Raw Markdown content.

        Returns:
            Clean plain text with Markdown syntax removed. Empty string
            for falsy input.
        """
        if not content:
            return ""

        text = content

        # Fenced code blocks (``` ... ```) are dropped with their content.
        text = re.sub(r'```[\s\S]*?```', '', text)

        # Inline code (`...`) → keep the code text
        text = re.sub(r'`([^`]+)`', r'\1', text)

        # Images ![alt](url) → keep alt text. Must run before links so the
        # leading "!" is consumed together with the bracket pair.
        text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)

        # Links [text](url) → keep text
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

        # ── Block-level markers ──
        # These run BEFORE inline emphasis: a "*" bullet or a "***" rule
        # would otherwise be consumed as an emphasis delimiter.

        # Horizontal rules (---, *** or ___ on a line of their own)
        text = re.sub(r'^[ \t]*[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)

        # Headers (# ## ### etc.) → keep text
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)

        # Blockquotes (> ...) → keep text
        text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

        # List markers (-, *, + or "1."). Leading indentation is limited to
        # spaces/tabs so the match cannot start on a preceding blank line
        # and swallow the paragraph break in front of the list.
        text = re.sub(r'^[ \t]*[-*+]\s+', '', text, flags=re.MULTILINE)
        text = re.sub(r'^[ \t]*\d+\.\s+', '', text, flags=re.MULTILINE)

        # ── Inline emphasis ──

        # Bold (**text** or __text__) → keep text. Underscore delimiters
        # must not touch a word character on the outside, so identifiers
        # such as foo__bar__baz are left alone.
        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
        text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)

        # Italic (*text* or _text_) → keep text. The delimiters must hug
        # the emphasized text (no whitespace just inside), which keeps
        # "2 * 3 * 4" intact; underscores additionally need a non-word
        # character outside, which keeps snake_case_names intact.
        text = re.sub(r'\*(?!\s)(.+?)(?<!\s)\*', r'\1', text)
        text = re.sub(r'(?<!\w)_(?!\s)(.+?)(?<!\s)_(?!\w)', r'\1', text)

        # Strikethrough (~~text~~) → keep text
        text = re.sub(r'~~(.+?)~~', r'\1', text)

        # Collapse multiple blank lines
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    def extract_summary(self, content: str, max_length: int = 150) -> str:
        """
        Extract a summary by stripping Markdown and taking first N chars.

        Text longer than max_length is cut at the last space inside the
        limit (or hard-cut if there is none) and "..." is appended, so a
        truncated summary can be up to three characters longer than
        max_length.

        Args:
            content: Full Markdown content.
            max_length: Maximum characters of text kept in the summary.

        Returns:
            Clean text summary without Markdown formatting.
        """
        plain = self.process(content)
        if len(plain) <= max_length:
            return plain

        # Cut at word boundary
        truncated = plain[:max_length]
        last_space = truncated.rfind(" ")
        if last_space > 0:
            truncated = truncated[:last_space]
        return truncated + "..."

    def get_format_type(self) -> str:
        """Return the format identifier."""
        return "markdown"

    def extract_headings(self, content: str) -> List[str]:
        """
        Extract Markdown headings (lines starting with #).

        Args:
            content: Full Markdown content.

        Returns:
            List of heading text strings (without # symbols), in document
            order.
        """
        # Reuse the structured scan so both methods always agree on what
        # counts as a heading.
        return [entry["text"] for entry in self.get_heading_structure(content)]

    def get_heading_structure(self, content: str) -> List[dict]:
        """
        Extract headings with their hierarchy level.

        A heading is a line that, after stripping surrounding whitespace,
        starts with one to six "#" characters followed by whitespace and
        some text.

        Args:
            content: Full Markdown content.

        Returns:
            List of dicts: [{'level': 1, 'text': 'Title'}, ...]
        """
        structure = []
        for line in content.split("\n"):
            match = re.match(r'^(#{1,6})\s+(.+)', line.strip())
            if match:
                structure.append({
                    "level": len(match.group(1)),
                    "text": match.group(2).strip(),
                })
        return structure
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cloudnoteslib.processors.plaintext_processor — Plain Text Processor.
|
|
3
|
+
|
|
4
|
+
INHERITANCE: Extends NoteProcessor with plain-text-specific logic.
|
|
5
|
+
POLYMORPHISM: Same interface as MarkdownProcessor/RichTextProcessor.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
>>> processor = PlainTextProcessor()
|
|
9
|
+
>>> processor.get_format_type()
|
|
10
|
+
'plaintext'
|
|
11
|
+
>>> processor.process(" Hello World ")
|
|
12
|
+
'Hello World'
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from typing import List
|
|
17
|
+
from .base import NoteProcessor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PlainTextProcessor(NoteProcessor):
    """
    Processes plain text note content.

    Provides minimal processing: whitespace normalization, line
    cleanup, and basic structural extraction.
    """

    def process(self, content: str) -> str:
        """
        Normalize whitespace and clean up plain text.

        Runs of spaces/tabs become a single space, every line is stripped,
        and runs of three or more newlines are collapsed to one blank line.

        Args:
            content: Raw plain text content.

        Returns:
            Cleaned text with normalized whitespace. Empty string for
            falsy input.
        """
        if not content:
            return ""

        # Normalize multiple spaces/tabs to a single space
        text = re.sub(r'[ \t]+', ' ', content)

        # Strip each line BEFORE collapsing blank lines: a line holding
        # only whitespace becomes empty here, and collapsing first would
        # leave such lines behind as runs of more than two newlines.
        text = '\n'.join(line.strip() for line in text.split('\n'))

        # Collapse 3+ newlines to double newline
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    def extract_summary(self, content: str, max_length: int = 150) -> str:
        """
        Extract summary from first N characters of plain text.

        Text longer than max_length is cut at the last space inside the
        limit (or hard-cut if there is none) and "..." is appended.

        Args:
            content: Full plain text content.
            max_length: Maximum characters of text kept in the summary.

        Returns:
            First N characters, cut at word boundary.
        """
        plain = self.process(content)
        if len(plain) <= max_length:
            return plain
        truncated = plain[:max_length]
        last_space = truncated.rfind(" ")
        if last_space > 0:
            truncated = truncated[:last_space]
        return truncated + "..."

    def get_format_type(self) -> str:
        """Return the format identifier."""
        return "plaintext"

    def extract_headings(self, content: str) -> List[str]:
        """
        Infer headings from all-caps lines or lines ending with colon.

        Plain text doesn't have formal heading syntax, so we use
        heuristics: a non-empty line counts as a heading when it is ALL
        CAPS and shorter than 80 characters (returned title-cased), or
        when it ends with ':' and is shorter than 60 characters (returned
        without the colon). The all-caps rule is checked first.

        Args:
            content: Full plain text content.

        Returns:
            List of inferred heading strings.
        """
        headings = []
        for line in content.split("\n"):
            stripped = line.strip()
            if not stripped:
                continue
            # All caps lines under 80 chars could be headings
            if stripped.isupper() and len(stripped) < 80:
                headings.append(stripped.title())
            # Short lines ending with colon
            elif stripped.endswith(":") and len(stripped) < 60:
                headings.append(stripped[:-1].strip())
        return headings
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cloudnoteslib.processors.richtext_processor — Rich/HTML Text Processor.
|
|
3
|
+
|
|
4
|
+
INHERITANCE: Third concrete subclass of NoteProcessor.
|
|
5
|
+
POLYMORPHISM: Same interface, different behavior — strips HTML tags.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
>>> processor = RichTextProcessor()
|
|
9
|
+
>>> processor.process("<h1>Title</h1><p><b>Bold</b> text</p>")
|
|
10
|
+
'Title\\nBold text'
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from typing import List
|
|
15
|
+
from .base import NoteProcessor
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RichTextProcessor(NoteProcessor):
    """
    Processes HTML/rich text note content.

    Strips HTML tags while preserving text content. Decodes HTML
    entities and turns common block-level closing tags into line breaks.
    """

    @staticmethod
    def _decode_entities(text: str) -> str:
        """Decode HTML character references into plain-text characters."""
        # Local import keeps this class self-contained.
        from html import unescape

        # unescape() resolves named and numeric references in ONE pass, so
        # "&amp;lt;" yields the literal text "&lt;" instead of being
        # decoded twice into "<".
        decoded = unescape(text)
        # Plain-text friendly forms: non-breaking space → regular space,
        # horizontal ellipsis → three dots.
        return decoded.replace("\u00a0", " ").replace("\u2026", "...")

    def process(self, content: str) -> str:
        """
        Strip HTML tags and entities to extract plain text.

        Closing </p>, </div>, </li>, </h1>-</h6> tags and <br> become
        newlines, every remaining tag is removed, entities are decoded,
        runs of spaces/tabs become one space, each line is stripped and
        runs of three or more newlines collapse to one blank line.

        Args:
            content: Raw HTML/rich text content.

        Returns:
            Clean text with all HTML markup removed. Empty string for
            falsy input.
        """
        if not content:
            return ""

        text = content

        # Replace common block-level elements with newlines
        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'</p>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'</li>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'</h[1-6]>', '\n', text, flags=re.IGNORECASE)

        # Remove all remaining HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Decode entities only AFTER tags are gone, so escaped markup such
        # as "&lt;b&gt;" survives as the visible text "<b>".
        text = self._decode_entities(text)

        # Normalize whitespace. Lines are stripped before blank lines are
        # collapsed so whitespace-only lines cannot leave behind runs of
        # more than two newlines.
        text = re.sub(r'[ \t]+', ' ', text)
        text = '\n'.join(line.strip() for line in text.split('\n'))
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    def extract_summary(self, content: str, max_length: int = 150) -> str:
        """
        Extract summary by stripping HTML first, then truncating.

        Text longer than max_length is cut at the last space inside the
        limit (or hard-cut if there is none) and "..." is appended.

        Args:
            content: Full HTML content.
            max_length: Maximum characters of text kept in the summary.

        Returns:
            Clean text summary without HTML tags.
        """
        plain = self.process(content)
        if len(plain) <= max_length:
            return plain
        truncated = plain[:max_length]
        last_space = truncated.rfind(" ")
        if last_space > 0:
            truncated = truncated[:last_space]
        return truncated + "..."

    def get_format_type(self) -> str:
        """Return the format identifier."""
        return "richtext"

    def extract_headings(self, content: str) -> List[str]:
        """
        Extract headings from HTML heading tags (h1-h6).

        Nested tags inside a heading are removed and entities are decoded,
        so the returned text matches what process() would show. Headings
        that are empty after cleanup are skipped.

        Args:
            content: Full HTML content.

        Returns:
            List of heading text strings, in document order.
        """
        headings = []
        matches = re.findall(
            r'<h[1-6][^>]*>(.*?)</h[1-6]>',
            content,
            flags=re.IGNORECASE | re.DOTALL,
        )
        for inner in matches:
            # Strip any nested tags inside the heading, then decode
            # entities so "A &amp; B" is reported as "A & B".
            heading = self._decode_entities(re.sub(r'<[^>]+>', '', inner)).strip()
            if heading:
                headings.append(heading)
        return headings
|