imbed_data_prep 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. imbed_data_prep-0.1.1/.gitattributes +1 -0
  2. imbed_data_prep-0.1.1/.github/workflows/ci.yml +255 -0
  3. imbed_data_prep-0.1.1/.gitignore +115 -0
  4. imbed_data_prep-0.1.1/LICENSE +21 -0
  5. imbed_data_prep-0.1.1/PKG-INFO +41 -0
  6. imbed_data_prep-0.1.1/README.md +3 -0
  7. imbed_data_prep-0.1.1/imbed_data_prep/__init__.py +12 -0
  8. imbed_data_prep-0.1.1/imbed_data_prep/arxiv/README.md +31 -0
  9. imbed_data_prep-0.1.1/imbed_data_prep/arxiv/__init__.py +10 -0
  10. imbed_data_prep-0.1.1/imbed_data_prep/embeddings_of_aggregations/README.md +46 -0
  11. imbed_data_prep-0.1.1/imbed_data_prep/embeddings_of_aggregations/__init__.py +333 -0
  12. imbed_data_prep-0.1.1/imbed_data_prep/embeddings_of_aggregations/embeddings_and_order.ipynb +5573 -0
  13. imbed_data_prep-0.1.1/imbed_data_prep/epstein_files/README.md +182 -0
  14. imbed_data_prep-0.1.1/imbed_data_prep/epstein_files/__init__.py +1061 -0
  15. imbed_data_prep-0.1.1/imbed_data_prep/epstein_files/epstein_files.ipynb +2071 -0
  16. imbed_data_prep-0.1.1/imbed_data_prep/epstein_files/epstein_files_tables_info.json +1 -0
  17. imbed_data_prep-0.1.1/imbed_data_prep/epstein_files/epstein_files_tables_info.pickle +0 -0
  18. imbed_data_prep-0.1.1/imbed_data_prep/eurovis/README.md +52 -0
  19. imbed_data_prep-0.1.1/imbed_data_prep/eurovis/__init__.py +146 -0
  20. imbed_data_prep-0.1.1/imbed_data_prep/eurovis/eurovis.ipynb +3345 -0
  21. imbed_data_prep-0.1.1/imbed_data_prep/github_repos/README.md +45 -0
  22. imbed_data_prep-0.1.1/imbed_data_prep/github_repos/__init__.py +190 -0
  23. imbed_data_prep-0.1.1/imbed_data_prep/github_repos/github_repos.ipynb +840 -0
  24. imbed_data_prep-0.1.1/imbed_data_prep/hcp/README.md +48 -0
  25. imbed_data_prep-0.1.1/imbed_data_prep/hcp/__init__.py +253 -0
  26. imbed_data_prep-0.1.1/imbed_data_prep/hcp/hcp_analysis.ipynb +3886 -0
  27. imbed_data_prep-0.1.1/imbed_data_prep/jersey_laws/README.md +45 -0
  28. imbed_data_prep-0.1.1/imbed_data_prep/jersey_laws/__init__.py +85 -0
  29. imbed_data_prep-0.1.1/imbed_data_prep/jersey_laws/jersey_laws.ipynb +509 -0
  30. imbed_data_prep-0.1.1/imbed_data_prep/lmsys_ai_conversations/README.md +57 -0
  31. imbed_data_prep-0.1.1/imbed_data_prep/lmsys_ai_conversations/__init__.py +786 -0
  32. imbed_data_prep-0.1.1/imbed_data_prep/mcdonalds_reviews/README.md +51 -0
  33. imbed_data_prep-0.1.1/imbed_data_prep/mcdonalds_reviews/__init__.py +463 -0
  34. imbed_data_prep-0.1.1/imbed_data_prep/mcdonalds_reviews/mcdonalds_reviews_dacc.ipynb +1240 -0
  35. imbed_data_prep-0.1.1/imbed_data_prep/prompt_injections/README.md +44 -0
  36. imbed_data_prep-0.1.1/imbed_data_prep/prompt_injections/__init__.py +63 -0
  37. imbed_data_prep-0.1.1/imbed_data_prep/prompt_injections/prompt_injection_w_umap_embeddings.tsv +691 -0
  38. imbed_data_prep-0.1.1/imbed_data_prep/trump_vs_zelenskyy/README.md +60 -0
  39. imbed_data_prep-0.1.1/imbed_data_prep/trump_vs_zelenskyy/__init__.py +569 -0
  40. imbed_data_prep-0.1.1/imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelensky.md +448 -0
  41. imbed_data_prep-0.1.1/imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelenskyy.ipynb +3363 -0
  42. imbed_data_prep-0.1.1/imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelenskyy_embeddings.parquet +0 -0
  43. imbed_data_prep-0.1.1/imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelenskyy_transcript.parquet +0 -0
  44. imbed_data_prep-0.1.1/imbed_data_prep/twitter_sentiment/README.md +47 -0
  45. imbed_data_prep-0.1.1/imbed_data_prep/twitter_sentiment/__init__.py +174 -0
  46. imbed_data_prep-0.1.1/imbed_data_prep/twitter_sentiment/twitter_sentiment.ipynb +616 -0
  47. imbed_data_prep-0.1.1/imbed_data_prep/ultra_chat/README.md +37 -0
  48. imbed_data_prep-0.1.1/imbed_data_prep/ultra_chat/__init__.py +10 -0
  49. imbed_data_prep-0.1.1/imbed_data_prep/ultra_chat/ultra_chat.ipynb +229 -0
  50. imbed_data_prep-0.1.1/imbed_data_prep/wildchat/README.md +54 -0
  51. imbed_data_prep-0.1.1/imbed_data_prep/wildchat/__init__.py +265 -0
  52. imbed_data_prep-0.1.1/imbed_data_prep/wildchat/wildchat.ipynb +7787 -0
  53. imbed_data_prep-0.1.1/imbed_data_prep/wordnet_words/README.md +77 -0
  54. imbed_data_prep-0.1.1/imbed_data_prep/wordnet_words/__init__.py +1212 -0
  55. imbed_data_prep-0.1.1/imbed_data_prep/wordnet_words/test_synset_refactor.ipynb +229 -0
  56. imbed_data_prep-0.1.1/imbed_data_prep/wordnet_words/wordnet_words.ipynb +4341 -0
  57. imbed_data_prep-0.1.1/misc/REFACTOR_SUMMARY.md +181 -0
  58. imbed_data_prep-0.1.1/misc/ai_prompts.ipynb +9975 -0
  59. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Cheat Sheet for Python Machine Learning and Data Science.pdf +2498 -0
  60. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Data Science Cheat Sheets.pdf +2686 -0
  61. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Data Science Cheatsheet.pdf +2632 -0
  62. imbed_data_prep-0.1.1/misc/data/cheat_sheets/ML Cheatsheet Documentation.pdf +0 -0
  63. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Machine Learning Cheat Sheet.pdf +0 -0
  64. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Machine Learning Interview Cheat Sheets.pdf +0 -0
  65. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Machine Learning and Data Science Cheat Sheet.pdf +7 -0
  66. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Scikit-Learn Cheat Sheet for Machine Learning.pdf +0 -0
  67. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Scikit-Learn Cheat Sheet: Python Machine Learning.pdf +1 -0
  68. imbed_data_prep-0.1.1/misc/data/cheat_sheets/Scikit-Learn CheatSheet: Python Machine Learning Tutorial.pdf +0 -0
  69. imbed_data_prep-0.1.1/misc/data/cheat_sheets/The Complete Collection of Data Science Cheat Sheets.pdf +0 -0
  70. imbed_data_prep-0.1.1/misc/data/tmp.csv +1 -0
  71. imbed_data_prep-0.1.1/misc/eurovis copy 2.ipynb +3345 -0
  72. imbed_data_prep-0.1.1/misc/eurovis copy.ipynb +3345 -0
  73. imbed_data_prep-0.1.1/misc/explore_refactored_data.py +95 -0
  74. imbed_data_prep-0.1.1/misc/oa_embeddings_sentiment_models.pickle +0 -0
  75. imbed_data_prep-0.1.1/misc/quick_test_refactor.py +92 -0
  76. imbed_data_prep-0.1.1/misc/using_ai_to_get_data_descriptions.ipynb +572 -0
  77. imbed_data_prep-0.1.1/misc/various_data_preps.ipynb +2183 -0
  78. imbed_data_prep-0.1.1/pyproject.toml +183 -0
@@ -0,0 +1 @@
1
+ *.ipynb linguist-documentation
@@ -0,0 +1,255 @@
1
+ name: Continuous Integration (uv)
2
+ on: [push, pull_request]
3
+
4
+ # Workflow-level env vars from [tool.wads.ci.env] in pyproject.toml.
5
+ # Populated by wads-migrate / wads init via template substitution.
6
+ # Includes PROJECT_NAME, literal defaults from env.defaults, and secret-
7
+ # backed vars from required_envvars / test_envvars / extra_envvars,
8
+ # rendered as `KEY: secrets.KEY || ''` so missing secrets don't fail parsing.
9
+ env:
10
+ PROJECT_NAME: imbed_data_prep
11
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || '' }}
12
+
13
+ jobs:
14
+ # First job: Read configuration from pyproject.toml
15
+ setup:
16
+ name: Read Configuration
17
+ runs-on: ubuntu-latest
18
+ outputs:
19
+ project-name: ${{ steps.config.outputs.project-name }}
20
+ python-versions: ${{ steps.config.outputs.python-versions }}
21
+ pytest-args: ${{ steps.config.outputs.pytest-args }}
22
+ coverage-enabled: ${{ steps.config.outputs.coverage-enabled }}
23
+ exclude-paths: ${{ steps.config.outputs.exclude-paths }}
24
+ test-on-windows: ${{ steps.config.outputs.test-on-windows }}
25
+ tests-enabled: ${{ steps.config.outputs.tests-enabled }}
26
+ build-sdist: ${{ steps.config.outputs.build-sdist }}
27
+ build-wheel: ${{ steps.config.outputs.build-wheel }}
28
+ metrics-enabled: ${{ steps.config.outputs.metrics-enabled }}
29
+ metrics-config-path: ${{ steps.config.outputs.metrics-config-path }}
30
+ metrics-storage-branch: ${{ steps.config.outputs.metrics-storage-branch }}
31
+ metrics-python-version: ${{ steps.config.outputs.metrics-python-version }}
32
+ metrics-force-run: ${{ steps.config.outputs.metrics-force-run }}
33
+ ruff-enabled: ${{ steps.config.outputs.ruff-enabled }}
34
+ black-enabled: ${{ steps.config.outputs.black-enabled }}
35
+ mypy-enabled: ${{ steps.config.outputs.mypy-enabled }}
36
+ docs-enabled: ${{ steps.config.outputs.docs-enabled }}
37
+ publish-enabled: ${{ steps.config.outputs.publish-enabled }}
38
+ skip-ci-marker: ${{ steps.config.outputs.skip-ci-marker }}
39
+ publish-marker: ${{ steps.config.outputs.publish-marker }}
40
+
41
+ steps:
42
+ - uses: actions/checkout@v6
43
+
44
+ - name: Set up uv
45
+ uses: astral-sh/setup-uv@v7
46
+
47
+ - name: Set up Python
48
+ run: uv python install 3.11
49
+
50
+ - name: Read CI Config
51
+ id: config
52
+ uses: i2mint/wads/actions/read-ci-config@master
53
+ with:
54
+ pyproject-path: .
55
+
56
+ # Second job: Validation using the config
57
+ validation:
58
+ name: Validation
59
+ if: "!contains(github.event.head_commit.message, '[skip ci]')"
60
+ needs: setup
61
+ runs-on: ubuntu-latest
62
+ strategy:
63
+ matrix:
64
+ python-version: ${{ fromJson(needs.setup.outputs.python-versions) }}
65
+
66
+ steps:
67
+ - uses: actions/checkout@v6
68
+
69
+ - name: Set up uv
70
+ uses: astral-sh/setup-uv@v7
71
+ with:
72
+ enable-cache: true
73
+
74
+ - name: Set up Python ${{ matrix.python-version }}
75
+ uses: i2mint/wads/actions/setup-python-uv@master
76
+ with:
77
+ python-version: ${{ matrix.python-version }}
78
+
79
+ - name: Install System Dependencies
80
+ uses: i2mint/wads/actions/install-system-deps@master
81
+ with:
82
+ pyproject-path: .
83
+
84
+ - name: Install Dependencies
85
+ uses: i2mint/wads/actions/install-deps-uv@master
86
+
87
+ - name: Format Source Code
88
+ if: needs.setup.outputs.ruff-enabled != 'false'
89
+ run: uvx ruff format .
90
+
91
+ - name: Format Source Code (black)
92
+ if: needs.setup.outputs.black-enabled == 'true'
93
+ run: uvx black .
94
+
95
+ - name: Lint Validation
96
+ if: needs.setup.outputs.ruff-enabled != 'false'
97
+ run: uvx ruff check --output-format=github ${{ needs.setup.outputs.project-name }}
98
+
99
+ - name: Type Check (mypy)
100
+ if: needs.setup.outputs.mypy-enabled == 'true'
101
+ run: uvx mypy ${{ needs.setup.outputs.project-name }}
102
+
103
+ - name: Run Tests
104
+ if: needs.setup.outputs.tests-enabled != 'false'
105
+ uses: i2mint/wads/actions/run-tests-uv@master
106
+ with:
107
+ root-dir: ${{ needs.setup.outputs.project-name }}
108
+ pytest-args: ${{ needs.setup.outputs.pytest-args }}
109
+ exclude-paths: ${{ needs.setup.outputs.exclude-paths }}
110
+ coverage: ${{ needs.setup.outputs.coverage-enabled }}
111
+
112
+ - name: Track Code Metrics
113
+ if: needs.setup.outputs.metrics-enabled == 'true'
114
+ uses: i2mint/umpyre/actions/track-metrics@master
115
+ continue-on-error: true
116
+ with:
117
+ github-token: ${{ secrets.GITHUB_TOKEN }}
118
+ config-path: ${{ needs.setup.outputs.metrics-config-path }}
119
+ storage-branch: ${{ needs.setup.outputs.metrics-storage-branch }}
120
+ python-version: ${{ needs.setup.outputs.metrics-python-version }}
121
+ force-run: ${{ needs.setup.outputs.metrics-force-run }}
122
+
123
+ # Optional Windows testing (if enabled in config)
124
+ windows-validation:
125
+ name: Windows Tests
126
+ if: "!contains(github.event.head_commit.message, '[skip ci]') && needs.setup.outputs.test-on-windows == 'true' && needs.setup.outputs.tests-enabled != 'false'"
127
+ needs: setup
128
+ runs-on: windows-latest
129
+ continue-on-error: true
130
+ env:
131
+ # PEP 540 UTF-8 mode: avoid cp1252 UnicodeDecode/EncodeError when test
132
+ # code reads source files or scripts print non-ASCII characters.
133
+ PYTHONUTF8: "1"
134
+ PYTHONIOENCODING: "utf-8"
135
+
136
+ steps:
137
+ - uses: actions/checkout@v6
138
+
139
+ - name: Set up uv
140
+ uses: astral-sh/setup-uv@v7
141
+ with:
142
+ enable-cache: true
143
+
144
+ - name: Set up Python
145
+ uses: i2mint/wads/actions/setup-python-uv@master
146
+ with:
147
+ python-version: ${{ fromJson(needs.setup.outputs.python-versions)[0] }}
148
+
149
+ - name: Install System Dependencies
150
+ uses: i2mint/wads/actions/install-system-deps@master
151
+ with:
152
+ pyproject-path: .
153
+
154
+ - name: Install Dependencies
155
+ uses: i2mint/wads/actions/install-deps-uv@master
156
+
157
+ - name: Run Tests
158
+ uses: i2mint/wads/actions/run-tests-uv@master
159
+ with:
160
+ root-dir: ${{ needs.setup.outputs.project-name }}
161
+ pytest-args: ${{ needs.setup.outputs.pytest-args }}
162
+ exclude-paths: ${{ needs.setup.outputs.exclude-paths }}
163
+
164
+ # Publishing job
165
+ #
166
+ # Gated by [tool.wads.ci.publish] in pyproject.toml (read via the setup job):
167
+ # - skip-ci-marker : a commit message containing this substring always skips
168
+ # the publish job, even when publish-marker is also present (default "[skip ci]").
169
+ # - publish-enabled : whether publishing runs at all (default true).
170
+ # - publish-marker : when publishing is disabled, a commit message containing
171
+ # this substring forces the publish job (default "[publish]").
172
+ # The publish-enabled check uses `== 'true'` (fail-closed): if an older wads
173
+ # without these outputs is installed by read-ci-config, publishing is skipped
174
+ # rather than run unintentionally.
175
+ publish:
176
+ name: Publish
177
+ permissions:
178
+ contents: write
179
+ if: "!contains(github.event.head_commit.message, needs.setup.outputs.skip-ci-marker) && (github.ref == 'refs/heads/master' || github.ref == 'refs/heads/main') && (needs.setup.outputs.publish-enabled == 'true' || contains(github.event.head_commit.message, needs.setup.outputs.publish-marker))"
180
+ needs: [setup, validation]
181
+ runs-on: ubuntu-latest
182
+
183
+ steps:
184
+ - uses: actions/checkout@v6
185
+ with:
186
+ fetch-depth: 0
187
+ token: ${{ secrets.GITHUB_TOKEN }}
188
+
189
+ - name: Set up uv
190
+ uses: astral-sh/setup-uv@v7
191
+
192
+ - name: Set up Python
193
+ uses: i2mint/wads/actions/setup-python-uv@master
194
+ with:
195
+ python-version: ${{ fromJson(needs.setup.outputs.python-versions)[0] }}
196
+ create-venv: "false"
197
+
198
+ - name: Format Source Code
199
+ if: needs.setup.outputs.ruff-enabled != 'false'
200
+ run: uvx ruff format .
201
+
202
+ - name: Format Source Code (black)
203
+ if: needs.setup.outputs.black-enabled == 'true'
204
+ run: uvx black .
205
+
206
+ - name: Update Version Number
207
+ id: version
208
+ uses: i2mint/isee/actions/bump-version-number@master
209
+
210
+ - name: Build Distribution
211
+ uses: i2mint/wads/actions/build-dist-uv@master
212
+ with:
213
+ sdist: ${{ needs.setup.outputs.build-sdist }}
214
+ wheel: ${{ needs.setup.outputs.build-wheel }}
215
+
216
+ - name: Publish to PyPI
217
+ uses: i2mint/wads/actions/pypi-publish-uv@master
218
+ with:
219
+ pypi-token: ${{ secrets.PYPI_PASSWORD }}
220
+
221
+ - name: Force SSH for git remote
222
+ run: git remote set-url origin git@github.com:${{ github.repository }}.git
223
+
224
+ - name: Commit Changes
225
+ uses: i2mint/wads/actions/git-commit@master
226
+ with:
227
+ commit-message: "**CI** Formatted code + Updated version to ${{ env.VERSION }} [skip ci]"
228
+ ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
229
+ push: true
230
+
231
+ - name: Tag Repository
232
+ uses: i2mint/wads/actions/git-tag@master
233
+ with:
234
+ tag: ${{ env.VERSION }}
235
+ message: "Release version ${{ env.VERSION }}"
236
+ push: true
237
+
238
+ # Optional GitHub Pages (skipped when [tool.wads.ci.docs].enabled = false)
239
+ # Depends on validation (not publish) so docs still publish when the publish
240
+ # job is disabled via [tool.wads.ci.publish].enabled = false.
241
+ github-pages:
242
+ name: Publish GitHub Pages
243
+ permissions:
244
+ contents: write
245
+ pages: write
246
+ id-token: write
247
+ if: "!contains(github.event.head_commit.message, '[skip ci]') && github.ref == format('refs/heads/{0}', github.event.repository.default_branch) && needs.setup.outputs.docs-enabled != 'false'"
248
+ needs: [setup, validation]
249
+ runs-on: ubuntu-latest
250
+
251
+ steps:
252
+ - uses: i2mint/epythet/actions/publish-github-pages@master
253
+ with:
254
+ github-token: ${{ secrets.GITHUB_TOKEN }}
255
+ ignore: "tests/,scrap/,examples/"
@@ -0,0 +1,115 @@
1
+ wads_configs.json
2
+ data/wads_configs.json
3
+ wads/data/wads_configs.json
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+
11
+ .DS_Store
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+ _build
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+ docs/*
76
+
77
+ # PyBuilder
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # celery beat schedule file
87
+ celerybeat-schedule
88
+
89
+ # SageMath parsed files
90
+ *.sage.py
91
+
92
+ # Environments
93
+ .env
94
+ .venv
95
+ env/
96
+ venv/
97
+ ENV/
98
+ env.bak/
99
+ venv.bak/
100
+
101
+ # Spyder project settings
102
+ .spyderproject
103
+ .spyproject
104
+
105
+ # Rope project settings
106
+ .ropeproject
107
+
108
+ # mkdocs documentation
109
+ /site
110
+
111
+ # mypy
112
+ .mypy_cache/
113
+
114
+ # PyCharm
115
+ .idea
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 thorwhalen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.4
2
+ Name: imbed_data_prep
3
+ Version: 0.1.1
4
+ Summary: Modules to acquire and prepare data for the imbed package.
5
+ Project-URL: Homepage, https://github.com/thorwhalen/imbed_data_prep
6
+ Project-URL: Repository, https://github.com/thorwhalen/imbed_data_prep
7
+ Project-URL: Documentation, https://thorwhalen.github.io/imbed_data_prep
8
+ Author: thorwhalen
9
+ License: mit
10
+ License-File: LICENSE
11
+ Requires-Python: >=3.10
12
+ Requires-Dist: beautifulsoup4
13
+ Requires-Dist: datasets
14
+ Requires-Dist: dill
15
+ Requires-Dist: dol
16
+ Requires-Dist: duckdb
17
+ Requires-Dist: graze
18
+ Requires-Dist: haggle
19
+ Requires-Dist: i2
20
+ Requires-Dist: imbed
21
+ Requires-Dist: lkj
22
+ Requires-Dist: nltk
23
+ Requires-Dist: numpy
24
+ Requires-Dist: oa
25
+ Requires-Dist: pandas
26
+ Requires-Dist: requests
27
+ Requires-Dist: scikit-learn
28
+ Requires-Dist: tabled>=0.1.28
29
+ Requires-Dist: vadersentiment
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
32
+ Requires-Dist: pytest>=7.0; extra == 'dev'
33
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
34
+ Provides-Extra: docs
35
+ Requires-Dist: sphinx-rtd-theme>=1.0; extra == 'docs'
36
+ Requires-Dist: sphinx>=6.0; extra == 'docs'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # imbed_data_prep
40
+
41
+ Modules to acquire and prepare data from various datasets for use with the `imbed` package.
@@ -0,0 +1,3 @@
1
+ # imbed_data_prep
2
+
3
+ Modules to acquire and prepare data from various datasets for use with the `imbed` package.
@@ -0,0 +1,12 @@
1
+ """Modules to acquire and prepare data for the imbed package."""
2
+
3
+
4
+ # # Use __getitem__ to protect access of the modules of list_of_modules
5
+
6
+ # def __getitem__(name):
7
+ # # if name in list_of_modules:
8
+ # try:
9
+ # return globals()[name]
10
+ # except KeyError:
11
+ # pass # will raise ImportError below
12
+ # raise ImportError(f"No module named {name}")
@@ -0,0 +1,31 @@
1
+ # ArXiv
2
+
3
+ Data preparation for ArXiv papers.
4
+
5
+ ## Status
6
+
7
+ This module has been migrated to the standalone
8
+ [`xv`](https://pypi.org/project/xv/) package on PyPI.
9
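+ The module here is a thin wrapper that re-exports everything from `xv.data_access`. If `xv` is not installed, the import is silently skipped and this module exposes no names.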
10
+
11
+ ## Data source
12
+
13
+ [ArXiv](https://arxiv.org/) is an open-access repository of scientific
14
+ papers in physics, mathematics, computer science, and related fields, hosted
15
+ by Cornell University.
16
+
17
+ ## Usage
18
+
19
+ ```bash
20
+ pip install xv
21
+ ```
22
+
23
+ ```python
24
+ from imbed_data_prep.arxiv import ... # delegates to xv
25
+ ```
26
+
27
+ ## Files in this directory
28
+
29
+ | File | Description |
30
+ |---|---|
31
+ | `__init__.py` | Wrapper module importing from the `xv` package |
@@ -0,0 +1,10 @@
1
+ """Access to ArXiv data.
2
+
3
+ Project moved to xv: https://pypi.org/project/xv/
4
+
5
+ """
6
+
7
+ from contextlib import suppress
8
+
9
+ with suppress(ImportError):
10
+ from xv.data_access import * # pip install xv
@@ -0,0 +1,46 @@
1
+ # Embeddings of Aggregations
2
+
3
+ Experiments with aggregated embeddings over citation graphs.
4
+
5
+ ## Data source
6
+
7
+ This module works with citation graph data and academic paper metadata.
8
+ It takes a set of nodes (papers) with embeddings and citation links, then
9
+ explores how aggregating the titles of cited papers and embedding those
10
+ aggregated strings compares to the original embeddings.
11
+
12
+ The data is expected to come from an external citation graph (e.g. Semantic
13
+ Scholar, OpenAlex, or a custom corpus) loaded as DataFrames with paper IDs,
14
+ titles, and citation edges.
15
+
16
+ ## What it does
17
+
18
+ 1. **Sample nodes** from a citation graph.
19
+ 2. **Permute citations** -- for each citing paper, generate multiple random
20
+ orderings of its cited papers.
21
+ 3. **Aggregate titles** -- concatenate cited-paper titles in each permutation
22
+ order into a single string.
23
+ 4. **Embed aggregated strings** -- compute embeddings of these concatenated
24
+ title strings.
25
+ 5. **Compare** -- measure how the aggregated embeddings relate to the
26
+ original paper embeddings, exploring whether citation context captures
27
+ similar semantic information.
28
+
29
+ ## Output
30
+
31
+ A DataFrame with columns:
32
+
33
+ | Column | Description |
34
+ |---|---|
35
+ | `citing_id` | ID of the citing paper |
36
+ | `n_cited` | Number of papers cited |
37
+ | `permutation_index` | Index of this particular citation ordering |
38
+ | `aggregated_title` | Concatenated cited-paper titles |
39
+ | `embedding` | Embedding vector of the aggregated title string |
40
+
41
+ ## Files in this directory
42
+
43
+ | File | Description |
44
+ |---|---|
45
+ | `__init__.py` | Module code |
46
+ | `embeddings_and_order.ipynb` | Notebook exploring aggregation experiments |