linkedin2md 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkedin2md/__init__.py +64 -0
- linkedin2md/__main__.py +6 -0
- linkedin2md/cli.py +81 -0
- linkedin2md/converter.py +136 -0
- linkedin2md/extractor.py +80 -0
- linkedin2md/formatter.py +1073 -0
- linkedin2md/formatters/__init__.py +133 -0
- linkedin2md/formatters/activity.py +85 -0
- linkedin2md/formatters/advertising.py +109 -0
- linkedin2md/formatters/base.py +71 -0
- linkedin2md/formatters/content.py +265 -0
- linkedin2md/formatters/identity.py +70 -0
- linkedin2md/formatters/jobs.py +173 -0
- linkedin2md/formatters/learning.py +57 -0
- linkedin2md/formatters/network.py +133 -0
- linkedin2md/formatters/payments.py +34 -0
- linkedin2md/formatters/professional.py +225 -0
- linkedin2md/formatters/profile.py +53 -0
- linkedin2md/formatters/recommendations.py +139 -0
- linkedin2md/formatters/services.py +74 -0
- linkedin2md/language.py +88 -0
- linkedin2md/parser.py +1504 -0
- linkedin2md/parsers/__init__.py +147 -0
- linkedin2md/parsers/activity.py +92 -0
- linkedin2md/parsers/advertising.py +111 -0
- linkedin2md/parsers/base.py +236 -0
- linkedin2md/parsers/content.py +269 -0
- linkedin2md/parsers/identity.py +62 -0
- linkedin2md/parsers/jobs.py +195 -0
- linkedin2md/parsers/learning.py +71 -0
- linkedin2md/parsers/network.py +162 -0
- linkedin2md/parsers/payments.py +35 -0
- linkedin2md/parsers/professional.py +269 -0
- linkedin2md/parsers/profile.py +149 -0
- linkedin2md/parsers/recommendations.py +133 -0
- linkedin2md/parsers/services.py +66 -0
- linkedin2md/protocols.py +185 -0
- linkedin2md/registry.py +76 -0
- linkedin2md/writer.py +54 -0
- linkedin2md-0.1.0.dist-info/METADATA +155 -0
- linkedin2md-0.1.0.dist-info/RECORD +44 -0
- linkedin2md-0.1.0.dist-info/WHEEL +4 -0
- linkedin2md-0.1.0.dist-info/entry_points.txt +2 -0
- linkedin2md-0.1.0.dist-info/licenses/LICENSE +21 -0
linkedin2md/__init__.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""linkedin2md - Convert LinkedIn data exports to Markdown.
|
|
2
|
+
|
|
3
|
+
SOLID-compliant architecture:
|
|
4
|
+
- S: Each parser/formatter handles one section
|
|
5
|
+
- O: New sections added via registry decorators
|
|
6
|
+
- L: All parsers/formatters are substitutable via protocols
|
|
7
|
+
- I: Focused protocols for each concern
|
|
8
|
+
- D: Converter depends on abstractions, not concretions
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
# Main public API
|
|
14
|
+
from linkedin2md.converter import LinkedInToMarkdownConverter, create_converter
|
|
15
|
+
|
|
16
|
+
# Backward compatibility - import old API
|
|
17
|
+
# (These are deprecated but kept for compatibility)
|
|
18
|
+
from linkedin2md.formatter import MarkdownFormatter
|
|
19
|
+
from linkedin2md.parser import LinkedInExportParser
|
|
20
|
+
|
|
21
|
+
# Protocols for type hints and custom implementations
|
|
22
|
+
from linkedin2md.protocols import (
|
|
23
|
+
BilingualText,
|
|
24
|
+
DataExtractor,
|
|
25
|
+
FormatterRegistry,
|
|
26
|
+
LanguageDetector,
|
|
27
|
+
OutputWriter,
|
|
28
|
+
ParserRegistry,
|
|
29
|
+
SectionFormatter,
|
|
30
|
+
SectionParser,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Registries for extension
|
|
34
|
+
from linkedin2md.registry import (
|
|
35
|
+
get_formatter_registry,
|
|
36
|
+
get_parser_registry,
|
|
37
|
+
register_formatter,
|
|
38
|
+
register_parser,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
# Version
|
|
43
|
+
"__version__",
|
|
44
|
+
# Main API
|
|
45
|
+
"LinkedInToMarkdownConverter",
|
|
46
|
+
"create_converter",
|
|
47
|
+
# Protocols
|
|
48
|
+
"BilingualText",
|
|
49
|
+
"DataExtractor",
|
|
50
|
+
"FormatterRegistry",
|
|
51
|
+
"LanguageDetector",
|
|
52
|
+
"OutputWriter",
|
|
53
|
+
"ParserRegistry",
|
|
54
|
+
"SectionFormatter",
|
|
55
|
+
"SectionParser",
|
|
56
|
+
# Registry
|
|
57
|
+
"get_formatter_registry",
|
|
58
|
+
"get_parser_registry",
|
|
59
|
+
"register_formatter",
|
|
60
|
+
"register_parser",
|
|
61
|
+
# Backward compatibility (deprecated)
|
|
62
|
+
"LinkedInExportParser",
|
|
63
|
+
"MarkdownFormatter",
|
|
64
|
+
]
|
linkedin2md/__main__.py
ADDED
linkedin2md/cli.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""CLI for linkedin2md.
|
|
2
|
+
|
|
3
|
+
Dependency Inversion: Uses factory function, doesn't create dependencies directly.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from linkedin2md.converter import create_converter
|
|
11
|
+
|
|
12
|
+
# Maximum allowed file size in megabytes (500 MB)
|
|
13
|
+
MAX_FILE_SIZE_MB = 500
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main() -> int:
    """Run the linkedin2md command-line interface.

    Validates the source path (it must exist, carry a ``.zip`` suffix and not
    exceed ``MAX_FILE_SIZE_MB``), runs the conversion and prints the names of
    the files that were created.

    Returns:
        Process exit status: 0 on success, 1 when validation or conversion
        fails (the reason is printed to stderr).
    """
    args = _parse_args(sys.argv[1:])
    source = args.source

    if not source.exists():
        print(f"Error: File not found: {source}", file=sys.stderr)
        return 1

    if source.suffix.lower() != ".zip":
        print(f"Error: Expected .zip file, got {source.suffix}", file=sys.stderr)
        return 1

    # Reject oversized archives up front to prevent resource exhaustion.
    size_mb = source.stat().st_size / (1024 * 1024)
    if size_mb > MAX_FILE_SIZE_MB:
        message = (
            f"Error: File too large ({size_mb:.1f} MB). "
            f"Maximum allowed is {MAX_FILE_SIZE_MB} MB"
        )
        print(message, file=sys.stderr)
        return 1

    try:
        # The factory supplies the converter with all of its dependencies.
        created = create_converter(source, args.output).convert(lang=args.lang)
    except Exception as exc:
        # Top-level boundary: report any failure and exit non-zero.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    print(f"Created {len(created)} files in {args.output}/")
    for path in created:
        print(f" - {path.name}")

    return 0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _parse_args(argv: list[str]) -> argparse.Namespace:
|
|
54
|
+
"""Parse command line arguments."""
|
|
55
|
+
parser = argparse.ArgumentParser(
|
|
56
|
+
prog="linkedin2md",
|
|
57
|
+
description="Convert LinkedIn data exports to Markdown",
|
|
58
|
+
)
|
|
59
|
+
parser.add_argument(
|
|
60
|
+
"source",
|
|
61
|
+
type=Path,
|
|
62
|
+
help="LinkedIn ZIP export file",
|
|
63
|
+
)
|
|
64
|
+
parser.add_argument(
|
|
65
|
+
"-o",
|
|
66
|
+
"--output",
|
|
67
|
+
type=Path,
|
|
68
|
+
default=Path("linkedin_export"),
|
|
69
|
+
help="Output directory (default: linkedin_export)",
|
|
70
|
+
)
|
|
71
|
+
parser.add_argument(
|
|
72
|
+
"--lang",
|
|
73
|
+
choices=["en", "es"],
|
|
74
|
+
default="en",
|
|
75
|
+
help="Output language (default: en)",
|
|
76
|
+
)
|
|
77
|
+
return parser.parse_args(argv)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
linkedin2md/converter.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Main converter orchestrator with dependency injection.
|
|
2
|
+
|
|
3
|
+
Implements the Dependency Inversion Principle:
|
|
4
|
+
- Depends on abstractions (protocols), not concretions
|
|
5
|
+
- All dependencies are injected, not created internally
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from linkedin2md.protocols import (
|
|
11
|
+
DataExtractor,
|
|
12
|
+
FormatterRegistry,
|
|
13
|
+
OutputWriter,
|
|
14
|
+
ParserRegistry,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LinkedInToMarkdownConverter:
    """Main orchestrator for LinkedIn to Markdown conversion.

    SOLID Principles:
    - Single Responsibility: Only orchestrates the conversion process
    - Open/Closed: New parsers/formatters added via registries
    - Dependency Inversion: Depends on protocols, not implementations

    All dependencies are injected via constructor. A failure in any single
    section (parsing, formatting or writing) is reported as a warning and
    does not abort the remaining sections.
    """

    def __init__(
        self,
        extractor: DataExtractor,
        parser_registry: ParserRegistry,
        formatter_registry: FormatterRegistry,
        writer: OutputWriter,
    ):
        """Initialize with injected dependencies.

        Args:
            extractor: Extracts raw data from source
            parser_registry: Registry of section parsers
            formatter_registry: Registry of section formatters
            writer: Writes formatted output
        """
        self._extractor = extractor
        self._parsers = parser_registry
        self._formatters = formatter_registry
        self._writer = writer

    def convert(self, lang: str = "en") -> list[Path]:
        """Convert LinkedIn export to Markdown files.

        Args:
            lang: Output language ('en' or 'es')

        Returns:
            List of paths to created files
        """
        # Step 1: Extract raw CSV data
        raw_data = self._extractor.extract()

        # Step 2: Parse all sections
        parsed_data = self._parse_all(raw_data)

        # Step 3: Format and write all sections
        return self._format_and_write_all(parsed_data, lang)

    def _parse_all(self, raw_data: dict[str, list[dict]]) -> dict[str, object]:
        """Parse all sections using registered parsers.

        Returns:
            Mapping from each parser's ``section_key`` to its parse result.
            Sections whose parser raised are omitted.
        """
        parsed: dict[str, object] = {}

        for parser in self._parsers.get_all():
            try:
                parsed[parser.section_key] = parser.parse(raw_data)
            except Exception as e:
                # Log but don't fail on individual section errors
                print(f"Warning: Failed to parse {parser.section_key}: {e}")

        return parsed

    def _format_and_write_all(self, data: dict[str, object], lang: str) -> list[Path]:
        """Format and write all sections.

        The profile formatter receives the full parsed data; every other
        formatter receives only the data stored under its own section key
        and is skipped when that data is missing or empty.

        Returns:
            Paths of the files that were written, profile first.
        """
        files: list[Path] = []

        # Special handling for profile (needs full data)
        profile_formatter = self._formatters.get("profile")
        if profile_formatter:
            self._write_section("profile", profile_formatter, data, lang, files)

        # Format other sections
        for formatter in self._formatters.get_all():
            if formatter.section_key == "profile":
                continue  # Already handled

            section_data = data.get(formatter.section_key)
            if not section_data:
                continue

            self._write_section(
                formatter.section_key, formatter, section_data, lang, files
            )

        return files

    def _write_section(self, section_key, formatter, payload, lang, files) -> None:
        """Format one section and write it, appending the new path to *files*.

        Blank output is not written. Any exception raised while formatting or
        writing is downgraded to a warning so that one broken section (the
        profile included) cannot abort the rest of the conversion.
        """
        try:
            content = formatter.format(payload, lang)
            if content and content.strip():
                files.append(self._writer.write(section_key, content))
        except Exception as e:
            print(f"Warning: Failed to format {section_key}: {e}")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def create_converter(
    source: Path,
    output_dir: Path,
) -> LinkedInToMarkdownConverter:
    """Build a converter wired with the default dependencies.

    A convenience factory: callers get a fully configured converter, while
    the converter class itself still accepts injected dependencies (useful
    for testing).

    Args:
        source: Path handed to the ZIP extractor.
        output_dir: Directory handed to the Markdown file writer.

    Returns:
        A converter using the ZIP extractor, the parser and formatter
        registries, and the Markdown file writer.
    """
    # These imports are deliberately local: importing the two packages
    # triggers registration of the parsers and formatters.
    from linkedin2md import (
        formatters,  # noqa: F401
        parsers,  # noqa: F401
    )
    from linkedin2md.extractor import ZipDataExtractor
    from linkedin2md.registry import get_formatter_registry, get_parser_registry
    from linkedin2md.writer import MarkdownFileWriter

    zip_extractor = ZipDataExtractor(source)
    section_parsers = get_parser_registry()
    section_formatters = get_formatter_registry()
    markdown_writer = MarkdownFileWriter(output_dir)

    return LinkedInToMarkdownConverter(
        extractor=zip_extractor,
        parser_registry=section_parsers,
        formatter_registry=section_formatters,
        writer=markdown_writer,
    )
|
linkedin2md/extractor.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Data extraction implementations.
|
|
2
|
+
|
|
3
|
+
Single Responsibility: Extract raw data from sources.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import csv
|
|
7
|
+
import io
|
|
8
|
+
import zipfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from linkedin2md.protocols import DataExtractor
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ZipDataExtractor(DataExtractor):
|
|
15
|
+
"""Extract CSV data from LinkedIn ZIP export.
|
|
16
|
+
|
|
17
|
+
Single Responsibility: Only handles ZIP I/O and CSV parsing.
|
|
18
|
+
Does NOT transform or interpret the data.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self, zip_path: Path | str):
|
|
22
|
+
self.zip_path = Path(zip_path)
|
|
23
|
+
|
|
24
|
+
def extract(self) -> dict[str, list[dict]]:
|
|
25
|
+
"""Extract all CSVs from ZIP into raw dict format.
|
|
26
|
+
|
|
27
|
+
Raises:
|
|
28
|
+
ValueError: If the ZIP file is invalid or corrupted.
|
|
29
|
+
"""
|
|
30
|
+
data: dict[str, list[dict]] = {}
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
with zipfile.ZipFile(self.zip_path, "r") as zf:
|
|
34
|
+
for name in zf.namelist():
|
|
35
|
+
if name.endswith(".csv"):
|
|
36
|
+
with zf.open(name) as f:
|
|
37
|
+
content = f.read().decode("utf-8")
|
|
38
|
+
content = self._skip_header_notes(content)
|
|
39
|
+
reader = csv.DictReader(io.StringIO(content))
|
|
40
|
+
key = Path(name).stem.lower().replace(" ", "_")
|
|
41
|
+
data[key] = list(reader)
|
|
42
|
+
except zipfile.BadZipFile as err:
|
|
43
|
+
raise ValueError(f"Invalid or corrupted ZIP file: {self.zip_path}") from err
|
|
44
|
+
|
|
45
|
+
return data
|
|
46
|
+
|
|
47
|
+
def _skip_header_notes(self, content: str) -> str:
|
|
48
|
+
"""Skip header notes in LinkedIn CSVs.
|
|
49
|
+
|
|
50
|
+
Some files like Connections.csv start with:
|
|
51
|
+
Notes:
|
|
52
|
+
"When exporting your connection data..."
|
|
53
|
+
|
|
54
|
+
First Name,Last Name,URL,...
|
|
55
|
+
"""
|
|
56
|
+
lines = content.split("\n")
|
|
57
|
+
|
|
58
|
+
if lines and lines[0].strip().startswith("Notes"):
|
|
59
|
+
for i, line in enumerate(lines):
|
|
60
|
+
stripped = line.strip()
|
|
61
|
+
if not stripped:
|
|
62
|
+
continue
|
|
63
|
+
if "," in stripped and not stripped.startswith('"'):
|
|
64
|
+
return "\n".join(lines[i:])
|
|
65
|
+
|
|
66
|
+
return content
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class DictDataExtractor(DataExtractor):
    """In-memory extractor backed by an already-loaded dict (for testing).

    Single Responsibility: adapts existing data to the extractor interface.
    """

    def __init__(self, data: dict[str, list[dict]]):
        # Stored by reference; no copy is made.
        self._data: dict[str, list[dict]] = data

    def extract(self) -> dict[str, list[dict]]:
        """Hand back the very dict supplied at construction time."""
        preloaded = self._data
        return preloaded
|