openquery 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openquery-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +32 -0
- openquery-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +25 -0
- openquery-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
- openquery-0.1.0/.github/workflows/ci.yml +56 -0
- openquery-0.1.0/.github/workflows/publish.yml +43 -0
- openquery-0.1.0/.gitignore +13 -0
- openquery-0.1.0/.pre-commit-config.yaml +17 -0
- openquery-0.1.0/.python-version +1 -0
- openquery-0.1.0/CHANGELOG.md +31 -0
- openquery-0.1.0/CONTRIBUTING.md +81 -0
- openquery-0.1.0/Dockerfile +26 -0
- openquery-0.1.0/LICENSE +21 -0
- openquery-0.1.0/PKG-INFO +285 -0
- openquery-0.1.0/README.md +238 -0
- openquery-0.1.0/SECURITY.md +37 -0
- openquery-0.1.0/docker-compose.yml +24 -0
- openquery-0.1.0/pyproject.toml +70 -0
- openquery-0.1.0/src/openquery/__init__.py +3 -0
- openquery-0.1.0/src/openquery/__main__.py +5 -0
- openquery-0.1.0/src/openquery/app.py +44 -0
- openquery-0.1.0/src/openquery/commands/__init__.py +1 -0
- openquery-0.1.0/src/openquery/commands/query.py +82 -0
- openquery-0.1.0/src/openquery/commands/serve.py +36 -0
- openquery-0.1.0/src/openquery/commands/sources.py +30 -0
- openquery-0.1.0/src/openquery/config.py +50 -0
- openquery-0.1.0/src/openquery/core/__init__.py +1 -0
- openquery-0.1.0/src/openquery/core/browser.py +120 -0
- openquery-0.1.0/src/openquery/core/cache.py +158 -0
- openquery-0.1.0/src/openquery/core/captcha.py +130 -0
- openquery-0.1.0/src/openquery/core/rate_limit.py +79 -0
- openquery-0.1.0/src/openquery/core/retry.py +54 -0
- openquery-0.1.0/src/openquery/exceptions.py +38 -0
- openquery-0.1.0/src/openquery/models/__init__.py +1 -0
- openquery-0.1.0/src/openquery/models/co/__init__.py +1 -0
- openquery-0.1.0/src/openquery/models/co/runt.py +108 -0
- openquery-0.1.0/src/openquery/models/co/simit.py +24 -0
- openquery-0.1.0/src/openquery/models/common.py +22 -0
- openquery-0.1.0/src/openquery/server/__init__.py +1 -0
- openquery-0.1.0/src/openquery/server/app.py +31 -0
- openquery-0.1.0/src/openquery/server/auth.py +33 -0
- openquery-0.1.0/src/openquery/server/deps.py +28 -0
- openquery-0.1.0/src/openquery/server/routes/__init__.py +1 -0
- openquery-0.1.0/src/openquery/server/routes/health.py +21 -0
- openquery-0.1.0/src/openquery/server/routes/query.py +108 -0
- openquery-0.1.0/src/openquery/server/routes/sources.py +29 -0
- openquery-0.1.0/src/openquery/sources/__init__.py +51 -0
- openquery-0.1.0/src/openquery/sources/base.py +73 -0
- openquery-0.1.0/src/openquery/sources/co/__init__.py +1 -0
- openquery-0.1.0/src/openquery/sources/co/runt.py +298 -0
- openquery-0.1.0/src/openquery/sources/co/simit.py +188 -0
- openquery-0.1.0/src/openquery/sources/us/__init__.py +1 -0
- openquery-0.1.0/tests/__init__.py +0 -0
- openquery-0.1.0/tests/conftest.py +29 -0
- openquery-0.1.0/tests/test_api.py +47 -0
- openquery-0.1.0/tests/test_cache.py +40 -0
- openquery-0.1.0/tests/test_captcha.py +40 -0
- openquery-0.1.0/tests/test_runt.py +137 -0
- openquery-0.1.0/tests/test_simit.py +177 -0
- openquery-0.1.0/uv.lock +1091 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug Report
|
|
3
|
+
about: Report a bug or unexpected behavior
|
|
4
|
+
title: ""
|
|
5
|
+
labels: bug
|
|
6
|
+
assignees: ""
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Description
|
|
10
|
+
|
|
11
|
+
A clear description of the bug.
|
|
12
|
+
|
|
13
|
+
## Steps to Reproduce
|
|
14
|
+
|
|
15
|
+
1. Run `openquery ...`
|
|
16
|
+
2. ...
|
|
17
|
+
3. See error
|
|
18
|
+
|
|
19
|
+
## Expected Behavior
|
|
20
|
+
|
|
21
|
+
What you expected to happen.
|
|
22
|
+
|
|
23
|
+
## Actual Behavior
|
|
24
|
+
|
|
25
|
+
What actually happened. Include error messages or output.
|
|
26
|
+
|
|
27
|
+
## Environment
|
|
28
|
+
|
|
29
|
+
- OpenQuery version: (`openquery --version`)
|
|
30
|
+
- Python version:
|
|
31
|
+
- OS:
|
|
32
|
+
- Source queried (if applicable):
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature Request
|
|
3
|
+
about: Suggest a new feature or data source
|
|
4
|
+
title: ""
|
|
5
|
+
labels: enhancement
|
|
6
|
+
assignees: ""
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Description
|
|
10
|
+
|
|
11
|
+
What feature or data source would you like?
|
|
12
|
+
|
|
13
|
+
## Use Case
|
|
14
|
+
|
|
15
|
+
Why is this useful? What problem does it solve?
|
|
16
|
+
|
|
17
|
+
## Proposed Solution
|
|
18
|
+
|
|
19
|
+
How should it work? (CLI usage, API response, etc.)
|
|
20
|
+
|
|
21
|
+
## Additional Context
|
|
22
|
+
|
|
23
|
+
- Source URL (if requesting a new data source):
|
|
24
|
+
- Country/region:
|
|
25
|
+
- Does it require CAPTCHA or authentication?
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
## Summary
|
|
2
|
+
|
|
3
|
+
Brief description of changes.
|
|
4
|
+
|
|
5
|
+
## Changes
|
|
6
|
+
|
|
7
|
+
- ...
|
|
8
|
+
|
|
9
|
+
## Testing
|
|
10
|
+
|
|
11
|
+
- [ ] Tests pass (`uv run pytest`)
|
|
12
|
+
- [ ] Linting passes (`uv run ruff check src/ tests/`)
|
|
13
|
+
- [ ] New source includes both unit and integration tests (if applicable)
|
|
14
|
+
|
|
15
|
+
## Related Issues
|
|
16
|
+
|
|
17
|
+
Closes #
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
runs-on: ${{ matrix.os }}
|
|
15
|
+
strategy:
|
|
16
|
+
fail-fast: false
|
|
17
|
+
matrix:
|
|
18
|
+
os: [ubuntu-latest, macos-latest]
|
|
19
|
+
python-version: ["3.12", "3.13"]
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- uses: actions/checkout@v4
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
uses: astral-sh/setup-uv@v5
|
|
26
|
+
|
|
27
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
28
|
+
run: uv python install ${{ matrix.python-version }}
|
|
29
|
+
|
|
30
|
+
- name: Install dependencies
|
|
31
|
+
run: uv sync --all-extras
|
|
32
|
+
|
|
33
|
+
- name: Install system dependencies (Ubuntu)
|
|
34
|
+
if: runner.os == 'Linux'
|
|
35
|
+
run: sudo apt-get update && sudo apt-get install -y tesseract-ocr
|
|
36
|
+
|
|
37
|
+
- name: Install system dependencies (macOS)
|
|
38
|
+
if: runner.os == 'macOS'
|
|
39
|
+
run: brew install tesseract
|
|
40
|
+
|
|
41
|
+
- name: Install Playwright browsers
|
|
42
|
+
run: uv run playwright install --with-deps chromium
|
|
43
|
+
|
|
44
|
+
- name: Lint
|
|
45
|
+
run: uv run ruff check src/ tests/
|
|
46
|
+
|
|
47
|
+
- name: Test
|
|
48
|
+
run: uv run pytest --tb=short -q
|
|
49
|
+
|
|
50
|
+
docker:
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
steps:
|
|
53
|
+
- uses: actions/checkout@v4
|
|
54
|
+
|
|
55
|
+
- name: Build Docker image
|
|
56
|
+
run: docker build -t openquery:test .
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v5
|
|
19
|
+
|
|
20
|
+
- name: Build package
|
|
21
|
+
run: uv build
|
|
22
|
+
|
|
23
|
+
- name: Upload artifacts
|
|
24
|
+
uses: actions/upload-artifact@v4
|
|
25
|
+
with:
|
|
26
|
+
name: dist
|
|
27
|
+
path: dist/
|
|
28
|
+
|
|
29
|
+
publish:
|
|
30
|
+
needs: build
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
environment:
|
|
33
|
+
name: pypi
|
|
34
|
+
url: https://pypi.org/project/openquery/
|
|
35
|
+
steps:
|
|
36
|
+
- name: Download artifacts
|
|
37
|
+
uses: actions/download-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
name: dist
|
|
40
|
+
path: dist/
|
|
41
|
+
|
|
42
|
+
- name: Publish to PyPI
|
|
43
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
args: ["--maxkb=500"]
|
|
10
|
+
- id: check-merge-conflict
|
|
11
|
+
|
|
12
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
+
rev: v0.9.10
|
|
14
|
+
hooks:
|
|
15
|
+
- id: ruff
|
|
16
|
+
args: [--fix]
|
|
17
|
+
- id: ruff-format
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-03-31
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Core framework with `BaseSource` plugin architecture
|
|
15
|
+
- `BrowserManager` for Playwright-based scraping with WAF bypass
|
|
16
|
+
- CAPTCHA solving: `OCRSolver` (pytesseract), `TwoCaptchaSolver`, `ChainedSolver`
|
|
17
|
+
- Cache backends: in-memory (cachetools), Redis, SQLite
|
|
18
|
+
- Per-source token-bucket rate limiter
|
|
19
|
+
- Retry with exponential backoff
|
|
20
|
+
- **co.simit** — Colombian traffic fines (SIMIT) via Playwright DOM scraping
|
|
21
|
+
- **co.runt** — Colombian vehicle registry (RUNT) with CAPTCHA and Imperva WAF bypass
|
|
22
|
+
- FastAPI REST API with `/api/v1/query`, `/api/v1/sources`, `/api/v1/health`
|
|
23
|
+
- API key authentication middleware
|
|
24
|
+
- Typer CLI: `openquery query`, `openquery sources`, `openquery serve`
|
|
25
|
+
- Pydantic models for all response types
|
|
26
|
+
- Configuration via environment variables (`OPENQUERY_*`)
|
|
27
|
+
- Docker and docker-compose support with Redis
|
|
28
|
+
- 29 unit tests
|
|
29
|
+
|
|
30
|
+
[Unreleased]: https://github.com/dacrypt/openquery/compare/v0.1.0...HEAD
|
|
31
|
+
[0.1.0]: https://github.com/dacrypt/openquery/releases/tag/v0.1.0
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Contributing to OpenQuery
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing! This document provides guidelines to make the process smooth.
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/dacrypt/openquery.git
|
|
9
|
+
cd openquery
|
|
10
|
+
uv sync --all-extras
|
|
11
|
+
uv run playwright install chromium
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Running Tests
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# Unit tests
|
|
18
|
+
uv run pytest
|
|
19
|
+
|
|
20
|
+
# With coverage
|
|
21
|
+
uv run pytest --cov=openquery
|
|
22
|
+
|
|
23
|
+
# Integration tests (hit real external services)
|
|
24
|
+
uv run pytest -m integration
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Code Quality
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# Lint
|
|
31
|
+
uv run ruff check src/ tests/
|
|
32
|
+
|
|
33
|
+
# Auto-fix
|
|
34
|
+
uv run ruff check --fix src/ tests/
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Adding a New Data Source
|
|
38
|
+
|
|
39
|
+
This is the most common type of contribution. To add a new source:
|
|
40
|
+
|
|
41
|
+
1. **Create the model** in `src/openquery/models/<country>/` with a Pydantic `BaseModel`
|
|
42
|
+
2. **Create the source** in `src/openquery/sources/<country>/` implementing `BaseSource`
|
|
43
|
+
3. **Register it** with the `@register` decorator
|
|
44
|
+
4. **Add tests** in `tests/test_<source>.py`
|
|
45
|
+
5. **Update the README** source table
|
|
46
|
+
|
|
47
|
+
See [README.md](README.md#adding-a-new-source) for a complete example.
|
|
48
|
+
|
|
49
|
+
### Source Guidelines
|
|
50
|
+
|
|
51
|
+
- Use `BrowserManager` for browser automation — don't manage Playwright directly
|
|
52
|
+
- Use `CaptchaSolver` for CAPTCHA handling — don't implement solving inline
|
|
53
|
+
- Include a `SourceMeta` with accurate `rate_limit_rpm` to be respectful to servers
|
|
54
|
+
- Return typed Pydantic models, not raw dicts
|
|
55
|
+
- Add both unit tests (mocked) and integration tests (marked with `@pytest.mark.integration`)
|
|
56
|
+
|
|
57
|
+
## Pull Requests
|
|
58
|
+
|
|
59
|
+
1. Fork the repo and create a branch from `main`
|
|
60
|
+
2. Make your changes
|
|
61
|
+
3. Ensure tests pass: `uv run pytest`
|
|
62
|
+
4. Ensure linting passes: `uv run ruff check src/ tests/`
|
|
63
|
+
5. Write a clear PR description explaining what and why
|
|
64
|
+
|
|
65
|
+
## Reporting Bugs
|
|
66
|
+
|
|
67
|
+
Open an issue with:
|
|
68
|
+
|
|
69
|
+
- Steps to reproduce
|
|
70
|
+
- Expected vs actual behavior
|
|
71
|
+
- OpenQuery version (`openquery --version`)
|
|
72
|
+
- Python version and OS
|
|
73
|
+
|
|
74
|
+
## Suggesting Sources
|
|
75
|
+
|
|
76
|
+
If you know of a useful public data source, open an issue with:
|
|
77
|
+
|
|
78
|
+
- Source name and URL
|
|
79
|
+
- What data it provides
|
|
80
|
+
- Whether it requires CAPTCHA or authentication
|
|
81
|
+
- Country/region it covers
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
# System dependencies for tesseract OCR and Playwright
|
|
4
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
5
|
+
tesseract-ocr \
|
|
6
|
+
libtesseract-dev \
|
|
7
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
8
|
+
|
|
9
|
+
# Install uv
|
|
10
|
+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
|
11
|
+
|
|
12
|
+
WORKDIR /app
|
|
13
|
+
|
|
14
|
+
# Copy project files
|
|
15
|
+
COPY pyproject.toml .
|
|
16
|
+
COPY src/ src/
|
|
17
|
+
|
|
18
|
+
# Install dependencies
|
|
19
|
+
RUN uv pip install --system ".[serve]"
|
|
20
|
+
|
|
21
|
+
# Install Playwright browsers
|
|
22
|
+
RUN playwright install --with-deps chromium
|
|
23
|
+
|
|
24
|
+
EXPOSE 8000
|
|
25
|
+
|
|
26
|
+
CMD ["openquery", "serve", "--host", "0.0.0.0", "--port", "8000"]
|
openquery-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 dacrypt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
openquery-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: openquery
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Query public data sources worldwide via scraping and APIs
|
|
5
|
+
Project-URL: Homepage, https://github.com/dacrypt/openquery
|
|
6
|
+
Project-URL: Repository, https://github.com/dacrypt/openquery
|
|
7
|
+
Project-URL: Issues, https://github.com/dacrypt/openquery/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/dacrypt/openquery/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: dacrypt <dev@dacrypt.dev>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: api,captcha,government,playwright,public-data,scraping,web-scraping
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Framework :: FastAPI
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.12
|
|
25
|
+
Requires-Dist: cachetools>=5.5
|
|
26
|
+
Requires-Dist: httpx>=0.28
|
|
27
|
+
Requires-Dist: pillow>=11.0
|
|
28
|
+
Requires-Dist: playwright>=1.49
|
|
29
|
+
Requires-Dist: pydantic-settings>=2.7
|
|
30
|
+
Requires-Dist: pydantic>=2.10
|
|
31
|
+
Requires-Dist: pytesseract>=0.3.13
|
|
32
|
+
Requires-Dist: rich>=13.9
|
|
33
|
+
Requires-Dist: typer>=0.15
|
|
34
|
+
Provides-Extra: captcha
|
|
35
|
+
Requires-Dist: 2captcha-python>=1.5; extra == 'captcha'
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-httpx>=0.35; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff>=0.9; extra == 'dev'
|
|
41
|
+
Provides-Extra: redis
|
|
42
|
+
Requires-Dist: redis[hiredis]>=5.2; extra == 'redis'
|
|
43
|
+
Provides-Extra: serve
|
|
44
|
+
Requires-Dist: fastapi>=0.115; extra == 'serve'
|
|
45
|
+
Requires-Dist: uvicorn[standard]>=0.34; extra == 'serve'
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
# OpenQuery
|
|
49
|
+
|
|
50
|
+
[![CI](https://github.com/dacrypt/openquery/actions/workflows/ci.yml/badge.svg)](https://github.com/dacrypt/openquery/actions/workflows/ci.yml)
|
|
51
|
+
[![PyPI version](https://img.shields.io/pypi/v/openquery.svg)](https://pypi.org/project/openquery/)
|
|
52
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/openquery.svg)](https://pypi.org/project/openquery/)
|
|
53
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
54
|
+
|
|
55
|
+
Query public data sources worldwide through a unified CLI and REST API.
|
|
56
|
+
|
|
57
|
+
OpenQuery provides a plugin-based framework for scraping government websites, public registries, and open data APIs. It handles the hard parts — browser automation, CAPTCHA solving, WAF bypass, caching, and rate limiting — so you can focus on the data.
|
|
58
|
+
|
|
59
|
+
## Features
|
|
60
|
+
|
|
61
|
+
- **Unified interface** — one CLI and one API endpoint for all data sources
|
|
62
|
+
- **Browser automation** — Playwright-based scraping for JavaScript-heavy sites
|
|
63
|
+
- **CAPTCHA solving** — local OCR (pytesseract) with optional paid service fallback
|
|
64
|
+
- **WAF bypass** — browser-context API calls preserve session cookies
|
|
65
|
+
- **Caching** — in-memory, Redis, or SQLite backends with configurable TTL
|
|
66
|
+
- **Rate limiting** — per-source token-bucket to respect server limits
|
|
67
|
+
- **REST API** — FastAPI server with auto-generated OpenAPI docs
|
|
68
|
+
- **Extensible** — add new data sources by implementing a single class
|
|
69
|
+
- **Country-organized** — sources grouped by country code (`co`, `us`, etc.)
|
|
70
|
+
|
|
71
|
+
## Built-in Sources
|
|
72
|
+
|
|
73
|
+
| Source | Country | Description | Inputs | CAPTCHA |
|
|
74
|
+
|--------|---------|-------------|--------|---------|
|
|
75
|
+
| `co.simit` | CO | Traffic fines and violations | cedula, placa | No |
|
|
76
|
+
| `co.runt` | CO | National vehicle registry (SOAT, RTM, ownership) | vin, placa | Yes (OCR) |
|
|
77
|
+
|
|
78
|
+
## Installation
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install openquery
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
uv add openquery
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### System Dependencies
|
|
91
|
+
|
|
92
|
+
OpenQuery requires [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for CAPTCHA solving and Playwright browsers for web scraping:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# macOS
|
|
96
|
+
brew install tesseract
|
|
97
|
+
playwright install chromium
|
|
98
|
+
|
|
99
|
+
# Ubuntu/Debian
|
|
100
|
+
sudo apt-get install tesseract-ocr
|
|
101
|
+
playwright install --with-deps chromium
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Optional Extras
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
pip install "openquery[serve]" # FastAPI server (fastapi, uvicorn)
|
|
108
|
+
pip install "openquery[redis]" # Redis cache backend
|
|
109
|
+
pip install "openquery[captcha]" # 2captcha paid CAPTCHA solving
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Quick Start
|
|
113
|
+
|
|
114
|
+
### CLI
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
# List available data sources
|
|
118
|
+
openquery sources
|
|
119
|
+
|
|
120
|
+
# Query Colombian traffic fines by cedula
|
|
121
|
+
openquery query co.simit --cedula 12345678
|
|
122
|
+
|
|
123
|
+
# Query Colombian vehicle registry by plate
|
|
124
|
+
openquery query co.runt --placa ABC123
|
|
125
|
+
|
|
126
|
+
# Query by VIN
|
|
127
|
+
openquery query co.runt --vin 5YJ3E1EA1PF000001
|
|
128
|
+
|
|
129
|
+
# Output raw JSON
|
|
130
|
+
openquery query co.simit --cedula 12345678 --json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### REST API
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
# Start the API server
|
|
137
|
+
openquery serve
|
|
138
|
+
|
|
139
|
+
# Or with custom host/port
|
|
140
|
+
openquery serve --host 127.0.0.1 --port 3000
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Then query via HTTP:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
curl -X POST http://localhost:8000/api/v1/query \
|
|
147
|
+
-H "Content-Type: application/json" \
|
|
148
|
+
-d '{
|
|
149
|
+
"source": "co.simit",
|
|
150
|
+
"document_type": "cedula",
|
|
151
|
+
"document_number": "12345678"
|
|
152
|
+
}'
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**Response:**
|
|
156
|
+
|
|
157
|
+
```json
|
|
158
|
+
{
|
|
159
|
+
"ok": true,
|
|
160
|
+
"source": "co.simit",
|
|
161
|
+
"queried_at": "2026-03-31T10:30:00Z",
|
|
162
|
+
"cached": false,
|
|
163
|
+
"latency_ms": 4523,
|
|
164
|
+
"data": {
|
|
165
|
+
"comparendos": 0,
|
|
166
|
+
"multas": 0,
|
|
167
|
+
"total_deuda": 0.0,
|
|
168
|
+
"paz_y_salvo": true
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
**API Endpoints:**
|
|
174
|
+
|
|
175
|
+
| Method | Path | Description |
|
|
176
|
+
|--------|------|-------------|
|
|
177
|
+
| `POST` | `/api/v1/query` | Query a data source |
|
|
178
|
+
| `GET` | `/api/v1/sources` | List available sources |
|
|
179
|
+
| `GET` | `/api/v1/health` | Health check and cache stats |
|
|
180
|
+
| `GET` | `/docs` | Interactive API documentation |
|
|
181
|
+
|
|
182
|
+
### Docker
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
docker compose up
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
This starts the API server on port 8000, using Redis as the cache backend.
|
|
189
|
+
|
|
190
|
+
## Configuration
|
|
191
|
+
|
|
192
|
+
All settings use environment variables with the `OPENQUERY_` prefix:
|
|
193
|
+
|
|
194
|
+
| Variable | Default | Description |
|
|
195
|
+
|----------|---------|-------------|
|
|
196
|
+
| `OPENQUERY_API_KEY` | _(none)_ | API key for server authentication |
|
|
197
|
+
| `OPENQUERY_CACHE_BACKEND` | `memory` | Cache backend: `memory`, `redis`, `sqlite` |
|
|
198
|
+
| `OPENQUERY_CACHE_TTL_DEFAULT` | `3600` | Default cache TTL in seconds |
|
|
199
|
+
| `OPENQUERY_REDIS_URL` | `redis://localhost:6379/0` | Redis connection URL |
|
|
200
|
+
| `OPENQUERY_BROWSER_HEADLESS` | `true` | Run browser in headless mode |
|
|
201
|
+
| `OPENQUERY_BROWSER_TIMEOUT` | `30.0` | Browser operation timeout in seconds |
|
|
202
|
+
| `OPENQUERY_RATE_LIMIT_DEFAULT_RPM` | `10` | Default requests per minute per source |
|
|
203
|
+
| `OPENQUERY_CAPTCHA_SOLVER` | `ocr` | CAPTCHA solver: `ocr`, `2captcha`, `chained` |
|
|
204
|
+
| `OPENQUERY_TWO_CAPTCHA_API_KEY` | _(none)_ | 2captcha.com API key |
|
|
205
|
+
| `OPENQUERY_LOG_LEVEL` | `INFO` | Logging level |
|
|
206
|
+
|
|
207
|
+
## Adding a New Source
|
|
208
|
+
|
|
209
|
+
Create a new source by implementing the `BaseSource` class:
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
# src/openquery/sources/us/nhtsa.py
|
|
213
|
+
from pydantic import BaseModel
|
|
214
|
+
from openquery.sources import register
|
|
215
|
+
from openquery.sources.base import BaseSource, DocumentType, QueryInput, SourceMeta
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class NhtsaResult(BaseModel):
|
|
219
|
+
manufacturer: str = ""
|
|
220
|
+
model: str = ""
|
|
221
|
+
year: int = 0
|
|
222
|
+
recalls: list[dict] = []
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
@register
|
|
226
|
+
class NhtsaSource(BaseSource):
|
|
227
|
+
def meta(self) -> SourceMeta:
|
|
228
|
+
return SourceMeta(
|
|
229
|
+
name="us.nhtsa",
|
|
230
|
+
display_name="NHTSA Vehicle Safety",
|
|
231
|
+
description="US vehicle safety recalls and VIN decoding",
|
|
232
|
+
country="US",
|
|
233
|
+
url="https://vpic.nhtsa.dot.gov/api/",
|
|
234
|
+
supported_inputs=[DocumentType.VIN],
|
|
235
|
+
requires_captcha=False,
|
|
236
|
+
requires_browser=False,
|
|
237
|
+
rate_limit_rpm=30,
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
def query(self, input: QueryInput) -> NhtsaResult:
|
|
241
|
+
import httpx
|
|
242
|
+
resp = httpx.get(
|
|
243
|
+
f"https://vpic.nhtsa.dot.gov/api/vehicles/decodevin/{input.document_number}",
|
|
244
|
+
params={"format": "json"},
|
|
245
|
+
)
|
|
246
|
+
data = resp.json()
|
|
247
|
+
# Parse and return NhtsaResult...
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
The `@register` decorator automatically makes the source available in the CLI, API, and source listing.
|
|
251
|
+
|
|
252
|
+
## Architecture
|
|
253
|
+
|
|
254
|
+
```
|
|
255
|
+
openquery/
|
|
256
|
+
├── core/ # Infrastructure (browser, captcha, cache, rate limiting)
|
|
257
|
+
├── sources/ # Data source plugins, organized by country
|
|
258
|
+
│ ├── base.py # BaseSource ABC — implement this to add sources
|
|
259
|
+
│ ├── co/ # Colombia (SIMIT, RUNT)
|
|
260
|
+
│ └── us/ # United States (future)
|
|
261
|
+
├── models/ # Pydantic response models, organized by country
|
|
262
|
+
├── server/ # FastAPI REST API
|
|
263
|
+
└── commands/ # Typer CLI commands
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## Development
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
git clone https://github.com/dacrypt/openquery.git
|
|
270
|
+
cd openquery
|
|
271
|
+
uv sync --all-extras
|
|
272
|
+
uv run playwright install chromium
|
|
273
|
+
|
|
274
|
+
# Run tests
|
|
275
|
+
uv run pytest
|
|
276
|
+
|
|
277
|
+
# Lint
|
|
278
|
+
uv run ruff check src/ tests/
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
[MIT](LICENSE)
|