PyPI - fancydocx - Versions diffs - 0.1.0__tar.gz - Mend

fancydocx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

fancydocx-0.1.0/LICENSE +21 -0
fancydocx-0.1.0/PKG-INFO +90 -0
fancydocx-0.1.0/README.md +60 -0
fancydocx-0.1.0/fancydocx/__init__.py +155 -0
fancydocx-0.1.0/fancydocx/__main__.py +5 -0
fancydocx-0.1.0/fancydocx/cli.py +117 -0
fancydocx-0.1.0/fancydocx/color.py +128 -0
fancydocx-0.1.0/fancydocx/core.py +201 -0
fancydocx-0.1.0/fancydocx/fontmetrics.py +444 -0
fancydocx-0.1.0/fancydocx/numbering.py +167 -0
fancydocx-0.1.0/fancydocx/package.py +234 -0
fancydocx-0.1.0/fancydocx/render.py +1466 -0
fancydocx-0.1.0/fancydocx/styles.py +572 -0
fancydocx-0.1.0/fancydocx/theme.py +113 -0
fancydocx-0.1.0/fancydocx.egg-info/PKG-INFO +90 -0
fancydocx-0.1.0/fancydocx.egg-info/SOURCES.txt +20 -0
fancydocx-0.1.0/fancydocx.egg-info/dependency_links.txt +1 -0
fancydocx-0.1.0/fancydocx.egg-info/entry_points.txt +2 -0
fancydocx-0.1.0/fancydocx.egg-info/requires.txt +4 -0
fancydocx-0.1.0/fancydocx.egg-info/top_level.txt +1 -0
fancydocx-0.1.0/pyproject.toml +48 -0
fancydocx-0.1.0/setup.cfg +4 -0

fancydocx-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Bilal Sharif
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

fancydocx-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,90 @@
+Metadata-Version: 2.4
+Name: fancydocx
+Version: 0.1.0
+Summary: Convert fancy .docx files into a single self-contained HTML file. Pure Python, no LibreOffice.
+Author-email: Bilal Sharif <bilalwork31@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/bilalwork31-cyber/docx2html
+Project-URL: Repository, https://github.com/bilalwork31-cyber/docx2html
+Project-URL: Issues, https://github.com/bilalwork31-cyber/docx2html/issues
+Keywords: docx,html,converter,ooxml,word,resume,document,pure-python
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Topic :: Office/Business
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Provides-Extra: verify
+Requires-Dist: Pillow>=9; extra == "verify"
+Requires-Dist: numpy>=1.21; extra == "verify"
+Dynamic: license-file
+fancydocx
+=========
+Convert fancy, design-heavy Word (.docx) files into a single, self-contained
+HTML file, with all styling, images and fonts inlined. It is a from-scratch
+reader for the Office Open XML format, written in pure Python. No LibreOffice,
+no Word, and no third-party dependencies.
+Features
+--------
+- Tables, columns, shapes, text boxes and floating images
+- Theme colours, fonts and the full paragraph/run style cascade
+- Bullet and numbered lists, hyperlinks, headers and footers
+- Embedded fonts are recovered and inlined automatically
+- Batch conversion from a simple command-line tool
+Installation
+------------
+    pip install fancydocx
+Usage
+-----
+In Python:
+    import fancydocx
+    fancydocx.convert("resume.docx", "resume.html")   # write a file
+    html = fancydocx.convert("resume.docx")           # or return a string
+From the command line:
+    fancydocx resume.docx -o resume.html
+    fancydocx ./documents -o ./html --workers 8       # convert a folder
+Requirements
+------------
+Python 3.8 or newer. The library uses only the standard library.
+Notes
+-----
+- Text renders with the document's own fonts when they are installed on the
+  viewer's machine; pass --embed-fonts to inline them for portability.
+- EMF/WMF vector images are shown as placeholders, as browsers cannot
+  display them.
+License
+-------
+MIT. See the LICENSE file.

fancydocx-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,60 @@
+fancydocx
+=========
+Convert fancy, design-heavy Word (.docx) files into a single, self-contained
+HTML file, with all styling, images and fonts inlined. It is a from-scratch
+reader for the Office Open XML format, written in pure Python. No LibreOffice,
+no Word, and no third-party dependencies.
+Features
+--------
+- Tables, columns, shapes, text boxes and floating images
+- Theme colours, fonts and the full paragraph/run style cascade
+- Bullet and numbered lists, hyperlinks, headers and footers
+- Embedded fonts are recovered and inlined automatically
+- Batch conversion from a simple command-line tool
+Installation
+------------
+    pip install fancydocx
+Usage
+-----
+In Python:
+    import fancydocx
+    fancydocx.convert("resume.docx", "resume.html")   # write a file
+    html = fancydocx.convert("resume.docx")           # or return a string
+From the command line:
+    fancydocx resume.docx -o resume.html
+    fancydocx ./documents -o ./html --workers 8       # convert a folder
+Requirements
+------------
+Python 3.8 or newer. The library uses only the standard library.
+Notes
+-----
+- Text renders with the document's own fonts when they are installed on the
+  viewer's machine; pass --embed-fonts to inline them for portability.
+- EMF/WMF vector images are shown as placeholders, as browsers cannot
+  display them.
+License
+-------
+MIT. See the LICENSE file.

fancydocx-0.1.0/fancydocx/__init__.py ADDED Viewed

@@ -0,0 +1,155 @@
+"""
+fancydocx - pure-Python DOCX -> single self-contained HTML converter.
+    import fancydocx
+    fancydocx.convert("resume.docx", "resume.html")   # write a file
+    html = fancydocx.convert("resume.docx")            # or get the HTML string
+No external engines, no LibreOffice, no network. Images are inlined as data
+URIs and embedded fonts are recovered as @font-face, so the output is one
+portable .html file.
+"""
+from __future__ import annotations
+import html as _html
+import pathlib
+from .core import local
+from .package import DocxPackage
+from .theme import Theme
+from .styles import Styles, rpr_to_css, ppr_to_css, line_height_css
+from .numbering import Numbering
+from .render import Converter
+from .fontmetrics import embed_css_for_families
# Package version; the command-line interface prints it for ``--version``.
__version__ = "0.1.0"
# Names exported by ``from fancydocx import *``.
__all__ = ["convert", "convert_docx", "convert_file", "DocxPackage", "__version__"]
# Base stylesheet. convert_docx() emits it first inside the <style> element of
# every generated document, ahead of the per-document .docx-body rule and any
# @font-face CSS.
BASE_CSS = """
*{box-sizing:border-box}
html,body{margin:0;padding:0}
body{background:#e9e9ee;color:#000;-webkit-print-color-adjust:exact;print-color-adjust:exact;
     text-rendering:geometricPrecision}
.docx-doc{padding:24px 12px}
/* isolation:isolate makes each page its own stacking context, so that
   z-index:-1 layers (header/footer art, behindDoc shapes) paint ABOVE the
   page's own background but BELOW in-flow content -- exactly Word's
   page-color / behind-text / text layering. Without it, negative z-index
   children fall behind the page background and vanish. */
.docx-page{position:relative;background:#fff;margin:0 auto 24px;
           box-shadow:0 2px 14px rgba(0,0,0,.28);overflow:hidden;isolation:isolate}
/* .docx-body is intentionally NOT positioned so absolutely-positioned floats
   (anchored images/shapes) resolve against the .docx-page box = true page
   coordinates, matching Word's page-relative anchoring. */
.docx-page p{margin:0}
.docx-page table{border-spacing:0;max-width:none;border-collapse:collapse}
.docx-page td,.docx-page th{vertical-align:top}
.docx-page img{max-width:none}
.docx-page a{color:inherit;text-decoration:inherit}
.leader{flex:1 1 auto;align-self:flex-end;border-bottom:1px dotted currentColor;margin:0 4px 3px}
.tab{display:inline-block;min-width:2em}
.docx-header,.docx-footer{pointer-events:none}
@media print{
 html,body{background:#fff}
 .docx-doc{padding:0}
 .docx-page{box-shadow:none;margin:0;page-break-after:always}
 @page{margin:0}
}
"""
def _title(pkg, path):
    """Return the text to use for the HTML ``<title>``.

    Scans ``docProps/core.xml`` for the first element whose local name is
    ``title`` and whose text is non-blank, and returns that text stripped.
    When the part is missing or no such element exists, the input file's
    stem (file name without its final suffix) is returned instead.

    pkg  -- package object exposing ``xml(part_name)``; a ``None`` result
            means the part is not available.
    path -- path of the source document; only used for the fallback.
    """
    core = pkg.xml("docProps/core.xml")
    if core is not None:
        for el in core.iter():
            if local(el.tag) != "title":
                continue
            # Strip BEFORE testing: a whitespace-only title must not produce
            # an empty <title>; keep looking and fall back to the file stem.
            text = (el.text or "").strip()
            if text:
                return text
    return pathlib.Path(str(path)).stem
def _body_rule(styles, theme):
    """Build the ``.docx-body{...}`` CSS rule holding the document defaults.

    The rule carries the inherited run look (font family, font size and
    colour, when the default run properties define them), the default line
    height, and ``word-wrap:break-word``.
    """
    run_props = styles.effective_rpr(None, None, {})
    para_props = styles.effective_ppr(None, {})
    run_css = rpr_to_css(run_props, theme)
    # Only these three run declarations are promoted to the body rule.
    decls = {
        name: run_css[name]
        for name in ("font-family", "font-size", "color")
        if name in run_css
    }
    # Word single spacing is font-metric based (see fontmetrics.py); the
    # numeric factor keeps the geometry even under font substitution.
    default_size = run_props.get("sz") or 11.0
    decls["line-height"] = line_height_css(
        para_props.get("spacing"), run_props.get("font"), default_size)
    # Fallbacks used when the defaults name no font or size.
    decls.setdefault("font-family", "'Calibri', 'Segoe UI', sans-serif")
    decls.setdefault("font-size", "11pt")
    decls["word-wrap"] = "break-word"
    declarations = ";".join("%s:%s" % pair for pair in decls.items())
    return ".docx-body{%s}" % declarations
def convert_docx(path, include_headers=True, embed_fonts=False):
    """
    Render the .docx at ``path`` as one self-contained HTML document string.

    include_headers: forwarded to the Converter; controls header/footer
    rendering.

    embed_fonts: additionally inline every referenced font family found on
    THIS machine as base64 @font-face. This makes the HTML render with the
    exact intended glyph metrics on any viewer, at the cost of several MB
    per file -- off by default for batch conversions.

    The package is always closed again, even when rendering raises.
    """
    package = DocxPackage(path)
    try:
        theme = Theme(package)
        styles = Styles(package, theme)
        numbering = Numbering(package, theme)
        converter = Converter(package, theme, styles, numbering,
                              include_headers=include_headers)
        body_html = converter.render_document()
        font_css, embedded_families = package.font_face_css_and_families()
        if embed_fonts:
            # Families the document already embeds are not inlined twice.
            extra_css = embed_css_for_families(
                converter.used_fonts, already_embedded=embedded_families)
            if extra_css:
                if font_css:
                    font_css = font_css + "\n" + extra_css
                else:
                    font_css = extra_css
        body_rule = _body_rule(styles, theme)
        title = _title(package, path)
    finally:
        package.close()
    page_template = (
        "<!doctype html>\n<html lang=\"en\">\n<head>\n<meta charset=\"utf-8\">\n"
        "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n"
        "<title>%(title)s</title>\n<style>\n%(base)s\n%(rule)s\n%(fonts)s\n</style>\n"
        "</head>\n<body>\n"
        "<div class=\"docx-doc\">%(body)s</div>\n</body>\n</html>\n"
    )
    return page_template % {
        # The title comes from document metadata or the file name: escape it.
        "title": _html.escape(title),
        "base": BASE_CSS,
        "rule": body_rule,
        "fonts": font_css,
        "body": body_html,
    }
def convert_file(in_path, out_path, include_headers=True, embed_fonts=False):
    """Convert the .docx at ``in_path`` and write the HTML to ``out_path``.

    Missing parent directories of ``out_path`` are created. The file is
    written as UTF-8. Returns the output path as a string.
    """
    html_text = convert_docx(
        in_path, include_headers=include_headers, embed_fonts=embed_fonts)
    target = pathlib.Path(out_path)
    # Conversion runs first, so a failing document leaves no directories behind.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(html_text, encoding="utf-8")
    return str(target)
def convert(source, output=None, *, embed_fonts=False, include_headers=True):
    """
    Convert a .docx file; the single public entry point.

        import fancydocx
        fancydocx.convert("resume.docx", "resume.html")   # writes the file, returns its path
        html = fancydocx.convert("resume.docx")            # no output -> returns the HTML str

    Parameters
    ----------
    source : str | os.PathLike
        Path to the input .docx file.
    output : str | os.PathLike | None
        Destination of the HTML file. When None, nothing is written and the
        HTML is returned as a string.
    embed_fonts : bool
        Inline locally-installed referenced fonts as base64 @font-face
        (exact metrics on any viewer, at the cost of file size).
    include_headers : bool
        Render document headers/footers (default True).
    """
    options = {"include_headers": include_headers, "embed_fonts": embed_fonts}
    if output is not None:
        return convert_file(source, output, **options)
    return convert_docx(source, **options)

fancydocx-0.1.0/fancydocx/__main__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Enable ``python -m fancydocx`` to run the command-line interface."""
+from .cli import main
+if __name__ == "__main__":
+    raise SystemExit(main())

fancydocx-0.1.0/fancydocx/cli.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""
+Command-line interface for fancydocx, exposed as the ``fancydocx`` command
+(and ``python -m fancydocx``).
+  Single file:
+      fancydocx resume.docx                 -> resume.html (next to input)
+      fancydocx resume.docx -o out.html
+  Whole folder (recursive), mirroring the tree into an output dir:
+      fancydocx ./docs -o ./html
+      fancydocx ./docs -o ./html --workers 8
+"""
+from __future__ import annotations
+import argparse
+import concurrent.futures as cf
+import sys
+import time
+import traceback
+from pathlib import Path
+from . import __version__, convert_file
def _iter_docx(root, pattern):
    """Yield, in sorted order, the files under ``root`` matching ``pattern``.

    The search is recursive. Directories are skipped, and so are Word
    lock/temp files, whose names start with ``~$`` (e.g. ``~$name.docx``).
    """
    candidates = sorted(Path(root).rglob(pattern))
    for candidate in candidates:
        is_lock_file = candidate.name.startswith("~$")
        if not is_lock_file and candidate.is_file():
            yield candidate
def _one(in_path, out_path, include_headers, embed_fonts=False):
    """Convert one file and report the outcome instead of raising.

    Returns ``(in_path, out_path, error, seconds)``. ``error`` is None on
    success, otherwise the one-line "ExceptionType: message" text of the
    failure. ``seconds`` is the elapsed wall-clock time.
    """
    started = time.perf_counter()
    error = None
    try:
        convert_file(in_path, out_path, include_headers=include_headers,
                     embed_fonts=embed_fonts)
    except Exception as exc:
        # Batch boundary: a bad document must not abort the whole run.
        error = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    return (in_path, out_path, error, time.perf_counter() - started)
def main(argv=None):
    """Run the ``fancydocx`` command line and return the process exit code.

    argv -- argument list without the program name; None makes argparse
            read ``sys.argv``.

    Single-file mode (input is a file): writes ``-o`` or, by default, the
    input path with an ``.html`` suffix. Returns 0 on success, 1 on failure.

    Batch mode (input is a folder): converts every file matching ``--glob``
    recursively, mirroring the tree under ``-o`` (default ``<input>/_html``).
    Returns 1 if any file failed, otherwise 0 -- including when nothing
    matched. With ``--workers`` above 1 the files are converted in worker
    processes; a worker that dies is reported as a failed file rather than
    aborting the run.
    """
    ap = argparse.ArgumentParser(
        prog="fancydocx",
        description="Convert fancy .docx files to a single self-contained HTML file.")
    ap.add_argument("input", help="A .docx file or a folder containing .docx files")
    ap.add_argument("-o", "--output", help="Output .html file (single) or output folder (batch)")
    ap.add_argument("--glob", default="*.docx", help="Glob for batch mode (default: *.docx)")
    ap.add_argument("--workers", type=int, default=1,
                    help="Parallel worker processes for batch mode (default: 1)")
    ap.add_argument("--no-headers", action="store_true", help="Skip header/footer rendering")
    ap.add_argument("--embed-fonts", action="store_true",
                    help="Inline locally-installed referenced fonts as @font-face "
                         "(exact metrics everywhere, but several MB per file)")
    ap.add_argument("--quiet", action="store_true", help="Only print a final summary")
    ap.add_argument("--version", action="version", version="fancydocx %s" % __version__)
    args = ap.parse_args(argv)
    include_headers = not args.no_headers
    inp = Path(args.input)
    if not inp.exists():
        ap.error("input not found: %s" % inp)  # prints usage and exits
    # ---- single file -------------------------------------------------
    if inp.is_file():
        out = Path(args.output) if args.output else inp.with_suffix(".html")
        in_p, out_p, err, dt = _one(inp, out, include_headers, args.embed_fonts)
        if err:
            print("FAILED %s\n  %s" % (in_p, err), file=sys.stderr)
            return 1
        print("OK  %s -> %s  (%.2fs)" % (in_p, out_p, dt))
        return 0
    # ---- batch folder ------------------------------------------------
    out_dir = Path(args.output) if args.output else inp / "_html"
    files = list(_iter_docx(inp, args.glob))
    if not files:
        print("No files matching %r under %s" % (args.glob, inp))
        return 0
    # Mirror each input's position below `inp` under `out_dir`.
    jobs = [(f, out_dir / f.relative_to(inp).with_suffix(".html")) for f in files]
    ok = fail = 0
    total = len(jobs)
    started = time.perf_counter()
    print("Converting %d file(s) -> %s  (workers=%d)" % (total, out_dir, args.workers))

    def report(res, i):
        """Count one result tuple from _one() and print its progress line."""
        nonlocal ok, fail
        in_p, out_p, err, dt = res
        if err:
            fail += 1
            print("[%d/%d] FAILED %s\n    %s" % (i, total, in_p, err), file=sys.stderr)
        else:
            ok += 1
            if not args.quiet:
                print("[%d/%d] %s -> %s  (%.2fs)" % (i, total, in_p.name, out_p, dt))

    if args.workers > 1:
        with cf.ProcessPoolExecutor(max_workers=args.workers) as ex:
            # Remember the job of every future so a lost worker can still be
            # attributed to its input file.
            futs = {ex.submit(_one, f, o, include_headers, args.embed_fonts): (idx, f, o)
                    for idx, (f, o) in enumerate(jobs, 1)}
            # Results arrive in completion order; the printed index is the
            # job's position in the sorted input list.
            for fut in cf.as_completed(futs):
                idx, f, o = futs[fut]
                try:
                    res = fut.result()
                except Exception as e:
                    # _one() never raises, so this is the pool itself failing
                    # (e.g. a worker process died). Report it as a failed
                    # file and keep going so the summary is still printed.
                    msg = "".join(traceback.format_exception_only(type(e), e)).strip()
                    res = (f, o, msg or "worker process failed", 0.0)
                report(res, idx)
    else:
        for idx, (f, o) in enumerate(jobs, 1):
            report(_one(f, o, include_headers, args.embed_fonts), idx)
    print("\nDone: %d ok, %d failed, %d total in %.1fs"
          % (ok, fail, total, time.perf_counter() - started))
    return 1 if fail else 0
+if __name__ == "__main__":
+    raise SystemExit(main())

fancydocx-0.1.0/fancydocx/color.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""
+Color resolution: hex parsing, theme-color lookup, and the tint/shade
+math Office applies to themed colors.
+Word colors come in three flavors:
+    * explicit sRGB      <w:color w:val="1F4E79"/>
+    * "auto"             <w:color w:val="auto"/>  (context default)
+    * theme reference    <w:color w:themeColor="accent1" w:themeShade="BF"/>
+For theme references, `themeTint`/`themeShade` are a hex fraction of 255
+applied to the *luminance* of the resolved theme color (HSL space) -- this
+is what Office actually does, not a naive per-channel scale, so the
+accent-bar shades come out matching.
+"""
+from __future__ import annotations
+import colorsys
# Named highlight colors (<w:highlight w:val="yellow"/>) -> 6-digit sRGB hex.
# The values are those of the ST_HighlightColor simple type in ECMA-376
# (the classic 16-colour palette), NOT the CSS colours of the same name:
# e.g. Word's darkBlue is 000080, whereas CSS "darkblue" is 00008B.
HIGHLIGHT = {
    "black": "000000",
    "blue": "0000FF",
    "cyan": "00FFFF",
    "darkBlue": "000080",
    "darkCyan": "008080",
    "darkGray": "808080",
    "darkGreen": "008000",
    "darkMagenta": "800080",
    "darkRed": "800000",
    "darkYellow": "808000",
    "green": "00FF00",
    "lightGray": "C0C0C0",
    "magenta": "FF00FF",
    "red": "FF0000",
    "white": "FFFFFF",
    "yellow": "FFFF00",
}
# Maps a w:themeColor attribute value to its key in the theme's clrScheme.
# This is the default (identity) mapping; the <w:clrSchemeMapping> element in
# settings.xml may remap tx1/bg1/tx2/bg2, which theme.py takes care of.
THEME_ALIAS = {
    "dark1": "dk1",
    "light1": "lt1",
    "dark2": "dk2",
    "light2": "lt2",
    "text1": "dk1",
    "background1": "lt1",
    "text2": "dk2",
    "background2": "lt2",
    # accent1 .. accent6 keep their own name
    **{"accent%d" % n: "accent%d" % n for n in range(1, 7)},
    "hyperlink": "hlink",
    "followedHyperlink": "folHlink",
}
def normalize_hex(val):
    """Return a 6-digit uppercase hex string, or None for auto/blank/invalid.

    Surrounding whitespace and leading ``#`` characters are ignored, and a
    3-digit shorthand is expanded by doubling each digit ("abc" -> "AABBCC").
    Anything that is not exactly six hexadecimal digits afterwards yields
    None, as does the keyword "auto" in any letter case.
    """
    if not val:
        return None
    v = val.strip().lstrip("#")
    if v.lower() == "auto":
        return None
    if len(v) == 3:  # rare shorthand
        v = "".join(c * 2 for c in v)
    if len(v) != 6:
        return None
    # Check digit by digit. int(v, 16) is too lenient for validation: it also
    # accepts a sign ("+1F4E7"), a base prefix ("0x1F4E"), digit-group
    # underscores ("1_F4E7") and non-ASCII digits, none of which is a colour.
    if any(c not in "0123456789abcdefABCDEF" for c in v):
        return None
    return v.upper()
def hex_to_rgb(h):
    """Split a 6-digit hex string into an ``(r, g, b)`` tuple of ints."""
    red = int(h[0:2], 16)
    green = int(h[2:4], 16)
    blue = int(h[4:6], 16)
    return (red, green, blue)
def rgb_to_hex(rgb):
    """Format channel values as uppercase hex, two digits per channel.

    Each channel is rounded to the nearest integer and clamped to 0..255.
    """
    digits = []
    for channel in rgb:
        level = int(round(channel))
        level = max(0, min(255, level))
        digits.append("%02X" % level)
    return "".join(digits)
def apply_tint_shade(hex6, tint=None, shade=None):
    """
    Lighten and/or darken a base color by a themeTint / themeShade value.

    ``tint`` and ``shade`` are hex byte strings read as a fraction of 255.
    The adjustment is made on the lightness of the color in HLS space:
    shade scales it towards black, tint then blends it towards white. A
    value that is not valid hex is ignored. A falsy ``hex6`` is returned
    unchanged.
    """
    if not hex6:
        return hex6

    def fraction(byte_hex):
        """Hex byte string -> 0..1 factor; None when absent or not hex."""
        if byte_hex is None:
            return None
        try:
            return int(byte_hex, 16) / 255.0
        except ValueError:
            return None

    red, green, blue = (channel / 255.0 for channel in hex_to_rgb(hex6))
    hue, lightness, saturation = colorsys.rgb_to_hls(red, green, blue)
    darken = fraction(shade)
    if darken is not None:
        lightness = lightness * darken
    lighten = fraction(tint)
    if lighten is not None:
        lightness = lightness * lighten + (1.0 - lighten)
    lightness = max(0.0, min(1.0, lightness))
    red, green, blue = colorsys.hls_to_rgb(hue, lightness, saturation)
    return rgb_to_hex((red * 255, green * 255, blue * 255))
def color_descriptor(el):
    """
    Collect the color attributes of ``el`` into a descriptor dict.

    The dict has the keys "val", "theme", "tint" and "shade", read from
    w:val, w:themeColor, w:themeTint and w:themeShade; an attribute the
    element does not carry is stored as None. Returns None when ``el``
    itself is None.
    """
    if el is None:
        return None
    from .core import qn
    attribute_for = (
        ("val", "w:val"),
        ("theme", "w:themeColor"),
        ("tint", "w:themeTint"),
        ("shade", "w:themeShade"),
    )
    return {key: el.get(qn(attr)) for key, attr in attribute_for}
def resolve(desc, theme, default=None):
    """
    Turn a color descriptor into '#RRGGBB', or return ``default`` when it
    resolves to auto/none.

    Precedence: when Word saves a theme-referenced color it ALSO bakes the
    resolved sRGB into w:val (e.g. w:color w:val="9A92BF"
    w:themeColor="accent5" w:themeTint="99"). That cached value is Word's own
    integer-HSL math -- bit-exact by definition -- so it wins; the color is
    only recomputed from the theme when no explicit value exists (or it is
    'auto').
    """
    if desc is None:
        return default
    explicit = normalize_hex(desc.get("val"))
    if explicit:
        return "#" + explicit
    theme_name = desc.get("theme")
    if not theme_name or theme is None:
        return default
    # Try the name as written first, then its clrScheme alias.
    base = theme.color(theme_name)
    if not base:
        base = theme.color(THEME_ALIAS.get(theme_name, theme_name))
    if not base:
        return default
    return "#" + apply_tint_shade(base, desc.get("tint"), desc.get("shade"))