scitex 2.14.0__py3-none-any.whl → 2.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +71 -17
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +210 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +27 -0
- scitex/_mcp_tools/template.py +24 -0
- scitex/_mcp_tools/writer.py +17 -210
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +30 -1550
- scitex/ai/classification/timeseries/_sliding_window_core.py +467 -0
- scitex/ai/classification/timeseries/_sliding_window_plotting.py +369 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +129 -61
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/_tts.py +18 -10
- scitex/audio/engines/base.py +17 -10
- scitex/audio/engines/elevenlabs_engine.py +7 -2
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/_core/__init__.py +27 -0
- scitex/canvas/editor/flask_editor/_core/_bbox_extraction.py +200 -0
- scitex/canvas/editor/flask_editor/_core/_editor.py +173 -0
- scitex/canvas/editor/flask_editor/_core/_export_helpers.py +353 -0
- scitex/canvas/editor/flask_editor/_core/_routes_basic.py +190 -0
- scitex/canvas/editor/flask_editor/_core/_routes_export.py +332 -0
- scitex/canvas/editor/flask_editor/_core/_routes_panels.py +252 -0
- scitex/canvas/editor/flask_editor/_core/_routes_save.py +218 -0
- scitex/canvas/editor/flask_editor/_core.py +25 -1684
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +160 -41
- scitex/cli/capture.py +133 -20
- scitex/cli/introspect.py +488 -0
- scitex/cli/main.py +200 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/plt.py +414 -0
- scitex/cli/repro.py +15 -8
- scitex/cli/resource.py +15 -8
- scitex/cli/scholar/__init__.py +154 -8
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +355 -0
- scitex/cli/stats.py +136 -11
- scitex/cli/template.py +129 -12
- scitex/cli/tex.py +15 -8
- scitex/cli/writer.py +49 -299
- scitex/cloud/__init__.py +41 -2
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +256 -0
- scitex/context/__init__.py +22 -0
- scitex/dev/__init__.py +20 -1
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/gen/__init__.py +50 -14
- scitex/gen/_list_packages.py +4 -4
- scitex/introspect/__init__.py +82 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +41 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/{gen/_inspect_module.py → introspect/_list_api.py} +48 -56
- scitex/introspect/_mcp/__init__.py +41 -0
- scitex/introspect/_mcp/handlers.py +233 -0
- scitex/introspect/_members.py +155 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/_save.py +1 -2
- scitex/io/bundle/README.md +1 -1
- scitex/logging/_formatters.py +19 -9
- scitex/mcp_server.py +98 -5
- scitex/os/__init__.py +4 -0
- scitex/{gen → os}/_check_host.py +4 -5
- scitex/plt/__init__.py +245 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/session/__init__.py +26 -7
- scitex/session/_decorator.py +1 -1
- scitex/sh/README.md +1 -1
- scitex/sh/__init__.py +7 -4
- scitex/social/__init__.py +155 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/stats/_mcp/_handlers/__init__.py +31 -0
- scitex/stats/_mcp/_handlers/_corrections.py +113 -0
- scitex/stats/_mcp/_handlers/_descriptive.py +78 -0
- scitex/stats/_mcp/_handlers/_effect_size.py +106 -0
- scitex/stats/_mcp/_handlers/_format.py +94 -0
- scitex/stats/_mcp/_handlers/_normality.py +110 -0
- scitex/stats/_mcp/_handlers/_posthoc.py +224 -0
- scitex/stats/_mcp/_handlers/_power.py +247 -0
- scitex/stats/_mcp/_handlers/_recommend.py +102 -0
- scitex/stats/_mcp/_handlers/_run_test.py +279 -0
- scitex/stats/_mcp/_handlers/_stars.py +48 -0
- scitex/stats/_mcp/handlers.py +19 -1171
- scitex/stats/auto/_stat_style.py +175 -0
- scitex/stats/auto/_style_definitions.py +411 -0
- scitex/stats/auto/_styles.py +22 -620
- scitex/stats/descriptive/__init__.py +11 -8
- scitex/stats/descriptive/_ci.py +39 -0
- scitex/stats/power/_power.py +15 -4
- scitex/str/__init__.py +2 -1
- scitex/str/_title_case.py +63 -0
- scitex/template/README.md +1 -1
- scitex/template/__init__.py +25 -10
- scitex/template/_code_templates.py +147 -0
- scitex/template/_mcp/handlers.py +81 -0
- scitex/template/_mcp/tool_schemas.py +55 -0
- scitex/template/_templates/__init__.py +51 -0
- scitex/template/_templates/audio.py +233 -0
- scitex/template/_templates/canvas.py +312 -0
- scitex/template/_templates/capture.py +268 -0
- scitex/template/_templates/config.py +43 -0
- scitex/template/_templates/diagram.py +294 -0
- scitex/template/_templates/io.py +107 -0
- scitex/template/_templates/module.py +53 -0
- scitex/template/_templates/plt.py +202 -0
- scitex/template/_templates/scholar.py +267 -0
- scitex/template/_templates/session.py +130 -0
- scitex/template/_templates/session_minimal.py +43 -0
- scitex/template/_templates/session_plot.py +67 -0
- scitex/template/_templates/session_stats.py +77 -0
- scitex/template/_templates/stats.py +323 -0
- scitex/template/_templates/writer.py +296 -0
- scitex/template/clone_writer_directory.py +5 -5
- scitex/ui/_backends/_email.py +10 -2
- scitex/ui/_backends/_webhook.py +5 -1
- scitex/web/_search_pubmed.py +10 -6
- scitex/writer/README.md +1 -1
- scitex/writer/__init__.py +43 -34
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.3.dist-info/METADATA +667 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/RECORD +241 -120
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/gen/_ci.py +0 -12
- scitex/gen/_title_case.py +0 -89
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- /scitex/{gen → context}/_detect_environment.py +0 -0
- /scitex/{gen → context}/_get_notebook_path.py +0 -0
- /scitex/{gen/_shell.py → sh/_shell_legacy.py} +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,865 +0,0 @@
|
|
|
1
|
-
<!-- ---
|
|
2
|
-
!-- Timestamp: 2025-08-22 05:28:16
|
|
3
|
-
!-- Author: ywatanabe
|
|
4
|
-
!-- File: /home/ywatanabe/proj/SciTeX-Code/src/scitex/scholar/examples/SUGGESTIONS.md
|
|
5
|
-
!-- --- -->
|
|
6
|
-
|
|
7
|
-
You've hit another excellent and distinct debugging case. The error has changed, which tells us we are making progress and uncovering the complexities of running these translators outside their native environment.
|
|
8
|
-
|
|
9
|
-
The new key error is:
|
|
10
|
-
ERROR: Translator error: em.doWeb is not a function
|
|
11
|
-
|
|
12
|
-
This is different and more informative than the previous one. It's not just a data format issue; it's a structural problem with how the translator code is being executed.
|
|
13
|
-
|
|
14
|
-
## What's Happening Here?
|
|
15
|
-
Zotero translators have evolved, and not all of them follow the simple, global detectWeb() and doWeb() function pattern. The "Frontiers" translator uses a more modern, object-oriented approach.
|
|
16
|
-
|
|
17
|
-
When the Frontiers.js code is executed, it doesn't just define global functions. Instead, it likely creates an instance of an object (let's call it em).
|
|
18
|
-
|
|
19
|
-
The main logic, including the doWeb function, is a method of this em object.
|
|
20
|
-
|
|
21
|
-
Your current JavaScript wrapper and executor (zotero_wrapper.js) are not aware of this. They blindly look for a global doWeb function, which doesn't exist. The function they need to call is em.doWeb().
|
|
22
|
-
|
|
23
|
-
Because this first step fails, the script execution halts, and the subsequent SyntaxError: Unexpected token '<' is likely a red herring—a downstream consequence of the initial failure.
|
|
24
|
-
|
|
25
|
-
## The Solution: A More Intelligent Executor
|
|
26
|
-
To solve this, your JavaScript executor needs to be smarter. It must first run the translator's code and then inspect the global scope to find out how that specific translator needs to be called. It needs to check for a global doWeb function, and if that doesn't exist, it should look for a common object pattern (like an object named em with a doWeb method).
|
|
27
|
-
|
|
28
|
-
Here is an updated, more robust version of the execution logic for your _ZoteroTranslatorRunner.py. This change centralizes the execution logic within the page.evaluate call, making it more adaptable.
|
|
29
|
-
|
|
30
|
-
Replace the page.evaluate block in the extract_urls_pdf_async method of _ZoteroTranslatorRunner.py with this:
|
|
31
|
-
|
|
32
|
-
Python
|
|
33
|
-
|
|
34
|
-
# In _ZoteroTranslatorRunner.py -> extract_urls_pdf_async method
|
|
35
|
-
|
|
36
|
-
# ... after injecting the Zotero environment JS ...
|
|
37
|
-
|
|
38
|
-
result = await page.evaluate(
|
|
39
|
-
"""
|
|
40
|
-
async ([translatorCode, translatorLabel]) => {
|
|
41
|
-
// This is the main execution function, now running entirely in the browser.
|
|
42
|
-
// It's a combination of your wrapper and the execution logic.
|
|
43
|
-
const urls = new Set();
|
|
44
|
-
const items = [];
|
|
45
|
-
let translatorError = null;
|
|
46
|
-
|
|
47
|
-
// --- Start: Mock Zotero Environment (condensed from your zotero_wrapper.js) ---
|
|
48
|
-
window.Zotero = {
|
|
49
|
-
Item: function(type) {
|
|
50
|
-
this.itemType = type;
|
|
51
|
-
this.attachments = [];
|
|
52
|
-
this.url = null;
|
|
53
|
-
this.DOI = null;
|
|
54
|
-
this.complete = function() {
|
|
55
|
-
if (this.url) urls.add(this.url);
|
|
56
|
-
if (this.DOI) urls.add('https://doi.org/' + this.DOI);
|
|
57
|
-
this.attachments.forEach(att => {
|
|
58
|
-
if (att.url && (att.mimeType === 'application/pdf' || att.url.includes('.pdf'))) {
|
|
59
|
-
urls.add(att.url);
|
|
60
|
-
}
|
|
61
|
-
});
|
|
62
|
-
items.push(this);
|
|
63
|
-
};
|
|
64
|
-
},
|
|
65
|
-
loadTranslator: function() { return { setTranslator: function() {}, setString: function(s) {}, setHandler: function(e,h){}, translate: function() {} }; },
|
|
66
|
-
Utilities: {
|
|
67
|
-
xpath: (doc, xpath) => {
|
|
68
|
-
const result = document.evaluate(xpath, doc, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
|
|
69
|
-
const nodes = [];
|
|
70
|
-
let node;
|
|
71
|
-
while (node = result.iterateNext()) {
|
|
72
|
-
nodes.push(node);
|
|
73
|
-
}
|
|
74
|
-
return nodes;
|
|
75
|
-
},
|
|
76
|
-
xpathText: (doc, xpath) => document.evaluate(xpath, doc, null, XPathResult.STRING_TYPE, null).stringValue,
|
|
77
|
-
trimInternal: str => str ? str.trim().replace(/\\s+/g, ' ') : '',
|
|
78
|
-
},
|
|
79
|
-
debug: (msg) => console.log(`[Zotero Debug] ${msg}`),
|
|
80
|
-
done: () => console.log("Zotero.done() called."),
|
|
81
|
-
};
|
|
82
|
-
window.Z = window.Zotero;
|
|
83
|
-
window.ZU = window.Zotero.Utilities;
|
|
84
|
-
window.requestDocument = async (url) => document; // Simplified for this context
|
|
85
|
-
window.requestText = async (url, options) => {
|
|
86
|
-
if (url.startsWith('/')) url = window.location.origin + url;
|
|
87
|
-
try {
|
|
88
|
-
const response = await fetch(url, { credentials: 'include', ...(options || {}) });
|
|
89
|
-
const contentType = response.headers.get("content-type");
|
|
90
|
-
const text = await response.text();
|
|
91
|
-
if (contentType && contentType.includes("text/html")) {
|
|
92
|
-
console.error("requestText received HTML instead of data for URL:", url);
|
|
93
|
-
return null;
|
|
94
|
-
}
|
|
95
|
-
return text;
|
|
96
|
-
} catch (e) {
|
|
97
|
-
console.error("requestText failed for URL:", url, e);
|
|
98
|
-
return null;
|
|
99
|
-
}
|
|
100
|
-
};
|
|
101
|
-
// --- End: Mock Zotero Environment ---
|
|
102
|
-
|
|
103
|
-
try {
|
|
104
|
-
// Step 1: Execute the translator code to define its functions/objects
|
|
105
|
-
eval(translatorCode);
|
|
106
|
-
|
|
107
|
-
// Step 2: INTELLIGENTLY FIND AND CALL THE CORRECT doWeb function
|
|
108
|
-
let detected = false;
|
|
109
|
-
let doWebFunction = null;
|
|
110
|
-
let contextObject = window; // Assume global context by default
|
|
111
|
-
|
|
112
|
-
// Pattern 1: Standard global functions
|
|
113
|
-
if (typeof detectWeb === 'function' && typeof doWeb === 'function') {
|
|
114
|
-
if (detectWeb(document, window.location.href)) {
|
|
115
|
-
detected = true;
|
|
116
|
-
doWebFunction = doWeb;
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
// Pattern 2: Object-oriented pattern (like 'em.doWeb')
|
|
120
|
-
else if (typeof em === 'object' && typeof em.detectWeb === 'function' && typeof em.doWeb === 'function') {
|
|
121
|
-
if (em.detectWeb(document, window.location.href)) {
|
|
122
|
-
detected = true;
|
|
123
|
-
doWebFunction = em.doWeb;
|
|
124
|
-
contextObject = em; // The function must be called on the 'em' object
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
// Add other patterns here if you discover them
|
|
128
|
-
|
|
129
|
-
// Step 3: Execute if a valid pattern was found
|
|
130
|
-
if (detected && doWebFunction) {
|
|
131
|
-
console.log(`Executing translator with pattern on context: ${contextObject}`);
|
|
132
|
-
await doWebFunction.call(contextObject, document, window.location.href);
|
|
133
|
-
// Give async operations a moment to complete
|
|
134
|
-
await new Promise(resolve => setTimeout(resolve, 500));
|
|
135
|
-
} else {
|
|
136
|
-
console.log("Translator pattern not recognized or detectWeb returned false.");
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
} catch (e) {
|
|
140
|
-
translatorError = e.message;
|
|
141
|
-
console.error('Translator execution error:', e);
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
return {
|
|
145
|
-
success: !translatorError && urls.size > 0,
|
|
146
|
-
translator: translatorLabel,
|
|
147
|
-
urls: Array.from(urls),
|
|
148
|
-
itemCount: items.length,
|
|
149
|
-
error: translatorError
|
|
150
|
-
};
|
|
151
|
-
}
|
|
152
|
-
""",
|
|
153
|
-
[translator["code"], translator["label"]],
|
|
154
|
-
)
|
|
155
|
-
|
|
156
|
-
## Why This Fix Works
|
|
157
|
-
Self-Contained: It moves the entire mock environment and execution logic into the page.evaluate block. This is cleaner and ensures the environment is perfectly set up right before the translator runs.
|
|
158
|
-
|
|
159
|
-
Pattern Detection: The core of the new logic is the "INTELLIGENTLY FIND AND CALL" section. It no longer assumes a single structure.
|
|
160
|
-
|
|
161
|
-
It first checks for the "classic" global detectWeb/doWeb functions.
|
|
162
|
-
|
|
163
|
-
If that fails, it checks for a common object-oriented pattern: an object named em that contains the detectWeb and doWeb methods.
|
|
164
|
-
|
|
165
|
-
Correct `this` Context: When it finds the object-oriented pattern, it uses `doWebFunction.call(contextObject, ...)` to ensure that when doWeb is executed, the `this` keyword inside it correctly refers to the em object. This is what solves the "em.doWeb is not a function" error.
|
|
166
|
-
|
|
167
|
-
Extensible: You can easily add more else if blocks to this logic if you encounter other translator patterns in the future.
|
|
168
|
-
|
|
169
|
-
By making your runner smarter, you can handle the diversity of Zotero translators and significantly improve the reliability of your PDF extraction process.
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
Is your approach versatile enough for any publisher's pages?
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
Show thinking
|
|
182
|
-
That's an excellent question. The short answer is yes: this approach is incredibly versatile and is likely the most robust and scalable method you can build for this purpose.
|
|
183
|
-
|
|
184
|
-
However, it is not a "silver bullet" that will work flawlessly on every single publisher's website without occasional adjustments. Think of it as a powerful framework that handles 90% of the problem, but you'll still need to debug the tricky 10%.
|
|
185
|
-
|
|
186
|
-
Here’s a breakdown of its versatility and its limitations.
|
|
187
|
-
|
|
188
|
-
## Why This Approach Is So Powerful 🚀
|
|
189
|
-
Leverages Massive Community Effort: You are tapping into the Zotero open-source community, which has spent years writing and maintaining thousands of translators for individual publisher sites, journals, databases, and news outlets. You don't have to write a custom scraper for Elsevier, another for Springer, another for Nature, etc. The community has already done that hard work.
|
|
190
|
-
|
|
191
|
-
Handles Complex Scraping Logic: Zotero translators are far more sophisticated than simple CSS selectors. They contain logic to:
|
|
192
|
-
|
|
193
|
-
Try multiple methods to find metadata.
|
|
194
|
-
|
|
195
|
-
Parse hidden JSON data embedded in the page.
|
|
196
|
-
|
|
197
|
-
Automate clicking "Export" buttons and fetching citation files (like RIS or BibTeX) in the background.
|
|
198
|
-
|
|
199
|
-
Clean up messy data (e.g., formatting author names).
|
|
200
|
-
|
|
201
|
-
Standardized Interface: As we've discovered, the translators have a standardized structure (the JSON header with a target regex and the detectWeb/doWeb entry points). This allows your Python runner to treat them like plugins, dynamically loading and executing the correct one for any given URL.
|
|
202
|
-
|
|
203
|
-
## The Inevitable Limitations and Challenges 🛠️
|
|
204
|
-
While versatile, it's not universally plug-and-play. The errors you've already encountered are perfect examples of the types of issues you will run into.
|
|
205
|
-
|
|
206
|
-
Incomplete Mock Environment: Your zotero_wrapper.js is a mock of the real Zotero environment. We have implemented the common functions (Zotero.Item, requestText, etc.), but a translator for a niche website might call a less common utility function (e.g., ZU.cleanISBN()) that doesn't exist in your mock, causing an error.
|
|
207
|
-
|
|
208
|
-
Solution: This is an iterative process. When a translator fails because of a missing function, you add a simple implementation of it to your wrapper.
|
|
209
|
-
|
|
210
|
-
Aggressive Anti-Scraping Measures: Publishers actively try to block automated access. You may encounter:
|
|
211
|
-
|
|
212
|
-
CAPTCHAs: If your script makes too many requests too quickly, a site like Google Scholar or Scopus will throw up a CAPTCHA, which will halt the process.
|
|
213
|
-
|
|
214
|
-
Dynamic JavaScript Challenges: Some sites may use JavaScript techniques to detect and block automated browsers like Playwright.
|
|
215
|
-
|
|
216
|
-
Complex Authentication: This is the most common failure point. Even if your main page is logged in, a translator's background fetch request might fail because:
|
|
217
|
-
|
|
218
|
-
It's blocked by Cross-Origin Resource Sharing (CORS) policies.
|
|
219
|
-
|
|
220
|
-
It doesn't carry the necessary authentication tokens or cookies that the main page uses.
|
|
221
|
-
|
|
222
|
-
This is exactly what happened with your ScienceDirect attempt, which returned a login page (HTML) instead of data.
|
|
223
|
-
|
|
224
|
-
Outdated or Buggy Translators: Sometimes, a publisher will update their website, and the corresponding Zotero translator will be temporarily broken until a community member fixes it. You might be the first to discover the issue.
|
|
225
|
-
|
|
226
|
-
## Conclusion
|
|
227
|
-
Your approach of using a Python/Playwright runner to execute Zotero translators is the gold standard for this kind of task. It is far more versatile and maintainable than trying to write and manage hundreds of custom scrapers yourself.
|
|
228
|
-
|
|
229
|
-
The key is to think of it not as a single script you write once, but as a robust framework that you will occasionally need to debug and enhance as you encounter new publisher websites and new translator patterns.
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
#!/usr/bin/env python3
|
|
233
|
-
# -*- coding: utf-8 -*-
|
|
234
|
-
# File: test_zotero_runner.py
|
|
235
|
-
# ----------------------------------------
|
|
236
|
-
"""
|
|
237
|
-
Test Suite for ZoteroTranslatorRunner.
|
|
238
|
-
|
|
239
|
-
This script uses pytest to validate that the ZoteroTranslatorRunner can
|
|
240
|
-
correctly execute translators from various publishers with different internal
|
|
241
|
-
architectures (e.g., global functions vs. object-oriented methods).
|
|
242
|
-
|
|
243
|
-
Usage:
|
|
244
|
-
1. Make sure you have pytest and pytest-asyncio installed:
|
|
245
|
-
pip install pytest pytest-asyncio playwright
|
|
246
|
-
playwright install
|
|
247
|
-
2. Place this file in a 'tests' directory alongside your project source.
|
|
248
|
-
3. Make sure your _ZoteroTranslatorRunner.py is importable.
|
|
249
|
-
4. Run the tests from your project's root directory:
|
|
250
|
-
pytest
|
|
251
|
-
"""
|
|
252
|
-
|
|
253
|
-
import asyncio
|
|
254
|
-
import pytest
|
|
255
|
-
from playwright.async_api import Page, Browser, async_playwright
|
|
256
|
-
|
|
257
|
-
# Note: ZoteroTranslatorRunner is now deprecated. Use the Python-based zotero_translators instead.
|
|
258
|
-
# Old location (deprecated): from scitex.scholar.url.helpers.finders._ZoteroTranslatorRunner import ZoteroTranslatorRunner
|
|
259
|
-
# New location: Use scitex.scholar.url.strategies.zotero_translators for Python-based implementation
|
|
260
|
-
|
|
261
|
-
# --- Test Cases ---
|
|
262
|
-
# A curated list of URLs that represent different publishers and translator patterns.
|
|
263
|
-
# Each tuple contains: (test_id, url)
|
|
264
|
-
TEST_CASES = [
|
|
265
|
-
# Pattern: Object-oriented ('em.doWeb'). Test for the fix from the previous issue.
|
|
266
|
-
("frontiers", "https://www.frontiersin.org/journals/neuroscience/articles/10.3389/fnins.2019.00573/full"),
|
|
267
|
-
|
|
268
|
-
# Pattern: Standard global functions. A very common and reliable open-access source.
|
|
269
|
-
("arxiv", "https://arxiv.org/abs/2103.14030"),
|
|
270
|
-
|
|
271
|
-
# Pattern: Major publisher (Nature). Often has complex pages.
|
|
272
|
-
("nature", "https://www.nature.com/articles/s41586-021-03372-6"),
|
|
273
|
-
|
|
274
|
-
# Pattern: Another major publisher (Elsevier). This is an open-access article.
|
|
275
|
-
("sciencedirect_openaccess", "https://www.sciencedirect.com/science/article/pii/S009286742030120X"),
|
|
276
|
-
|
|
277
|
-
# Pattern: Major publisher (Wiley).
|
|
278
|
-
("wiley", "https://onlinelibrary.wiley.com/doi/full/10.1111/j.1365-2966.2006.10766.x"),
|
|
279
|
-
|
|
280
|
-
# Case: A URL that should NOT match any translator. Tests the graceful failure case.
|
|
281
|
-
("no_match", "https://www.google.com"),
|
|
282
|
-
]
|
|
283
|
-
|
|
284
|
-
# --- Pytest Fixtures ---
|
|
285
|
-
|
|
286
|
-
@pytest.fixture(scope="session")
|
|
287
|
-
def event_loop():
|
|
288
|
-
"""Create an instance of the default event loop for the session."""
|
|
289
|
-
loop = asyncio.get_event_loop_policy().new_event_loop()
|
|
290
|
-
yield loop
|
|
291
|
-
loop.close()
|
|
292
|
-
|
|
293
|
-
@pytest.fixture(scope="session")
|
|
294
|
-
def runner() -> ZoteroTranslatorRunner:
|
|
295
|
-
"""Provides a single instance of the ZoteroTranslatorRunner for all tests."""
|
|
296
|
-
print("\n--- Initializing ZoteroTranslatorRunner ---")
|
|
297
|
-
return ZoteroTranslatorRunner()
|
|
298
|
-
|
|
299
|
-
@pytest.fixture(scope="session")
|
|
300
|
-
async def browser() -> Browser:
|
|
301
|
-
"""Launches a single browser instance for the entire test session."""
|
|
302
|
-
async with async_playwright() as p:
|
|
303
|
-
print("\n--- Launching Browser ---")
|
|
304
|
-
browser_instance = await p.chromium.launch(headless=True)
|
|
305
|
-
yield browser_instance
|
|
306
|
-
print("\n--- Closing Browser ---")
|
|
307
|
-
await browser_instance.close()
|
|
308
|
-
|
|
309
|
-
@pytest.fixture
|
|
310
|
-
async def page(browser: Browser) -> Page:
|
|
311
|
-
"""Creates a new page for each test case."""
|
|
312
|
-
page = await browser.new_page()
|
|
313
|
-
yield page
|
|
314
|
-
await page.close()
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
# --- Test Functions ---
|
|
318
|
-
|
|
319
|
-
@pytest.mark.parametrize("test_id, url", TEST_CASES)
async def test_translator_execution(runner: ZoteroTranslatorRunner, page: Page, test_id: str, url: str):
    """
    Run the Zotero runner against one parametrized URL and check its result.

    Args:
        runner: The ZoteroTranslatorRunner instance.
        page: The Playwright page object for the test.
        test_id: A friendly name for the test case.
        url: The URL to test the translator against.
    """
    print(f"\n[Testing: {test_id}] Navigating to: {url}")

    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        # Fixed 3 s pause so dynamically loaded content has time to appear.
        await page.wait_for_timeout(3000)
    except Exception as nav_error:
        pytest.fail(f"Failed to navigate to {url}: {nav_error}")

    found = await runner.extract_urls_pdf_async(page)

    # Graceful-failure case: the runner must return an empty result, not crash.
    if test_id == "no_match":
        assert found is not None, "Runner should not crash on no match"
        assert len(found) == 0, f"Expected no URLs for {test_id}, but found {len(found)}"
        print(f"✅ [SUCCESS: {test_id}] Correctly found no matching translator.")
        return

    # Every other case must yield at least one URL that mentions "pdf".
    assert found is not None, "Runner should return a list, not None"
    assert len(found) > 0, f"Expected at least one PDF URL for {test_id}, but found none."

    head = found[0]
    assert isinstance(head, str), "Result should be a list of strings"
    assert "pdf" in head.lower(), f"Expected 'pdf' in the URL, but got: {head}"

    print(f"✅ [SUCCESS: {test_id}] Found {len(found)} PDF URL(s). First URL: {head}")
|
|
356
|
-
|
|
357
|
-
# Direct execution is for demonstration only: print a hint and list the cases.
if __name__ == "__main__":
    print(
        "This is a pytest script. To run the tests, execute 'pytest' in your terminal.",
        "Example Test Cases:",
        sep="\n",
    )
    for test_id, url in TEST_CASES:
        print(f"- {test_id.capitalize()}: {url}")
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
#!/usr/bin/env python3
|
|
366
|
-
# -*- coding: utf-8 -*-
|
|
367
|
-
# File: test_zotero_runner.py
|
|
368
|
-
# ----------------------------------------
|
|
369
|
-
"""
|
|
370
|
-
Test Suite for ZoteroTranslatorRunner.
|
|
371
|
-
|
|
372
|
-
This script uses pytest to validate that the ZoteroTranslatorRunner can
|
|
373
|
-
correctly execute translators from various publishers with different internal
|
|
374
|
-
architectures (e.g., global functions vs. object-oriented methods).
|
|
375
|
-
|
|
376
|
-
Usage:
|
|
377
|
-
1. Make sure you have pytest and pytest-asyncio installed:
|
|
378
|
-
pip install pytest pytest-asyncio playwright
|
|
379
|
-
playwright install
|
|
380
|
-
2. Place this file in a 'tests' directory alongside your project source.
|
|
381
|
-
3. Make sure your _ZoteroTranslatorRunner.py is importable.
|
|
382
|
-
4. Run the tests from your project's root directory:
|
|
383
|
-
pytest
|
|
384
|
-
"""
|
|
385
|
-
|
|
386
|
-
import asyncio
|
|
387
|
-
import pytest
|
|
388
|
-
from playwright.async_api import Page, Browser, async_playwright
|
|
389
|
-
|
|
390
|
-
# Note: ZoteroTranslatorRunner is now deprecated. Use the Python-based zotero_translators instead.
|
|
391
|
-
# Old location (deprecated): from scitex.scholar.url.helpers.finders._ZoteroTranslatorRunner import ZoteroTranslatorRunner
|
|
392
|
-
# New location: Use scitex.scholar.url.strategies.zotero_translators for Python-based implementation
|
|
393
|
-
|
|
394
|
-
# --- Test Cases ---
|
|
395
|
-
# A curated list of URLs that represent different publishers and translator patterns.
|
|
396
|
-
# Each tuple contains: (test_id, url)
|
|
397
|
-
# Each entry is (test_id, url). Ids are lowercase snake_case so they can be
# matched literally by the test function and used as stable pytest ids.
TEST_CASES = [
    # --- Tier 1: Core Academic Publishers & Platforms ---
    ("frontiers", "https://www.frontiersin.org/journals/neuroscience/articles/10.3389/fnins.2019.00573/full"),
    ("arxiv", "https://arxiv.org/abs/2103.14030"),
    ("nature", "https://www.nature.com/articles/s41586-021-03372-6"),
    ("sciencedirect_openaccess", "https://www.sciencedirect.com/science/article/pii/S009286742030120X"),
    ("wiley", "https://onlinelibrary.wiley.com/doi/full/10.1111/j.1365-2966.2006.10766.x"),
    ("taylor_and_francis", "https://www.tandfonline.com/doi/full/10.1080/09540261.2021.1897716"),
    ("sage", "https://journals.sagepub.com/doi/full/10.1177/0002716220987979"),
    ("springer", "https://link.springer.com/article/10.1007/s00221-021-06089-y"),
    ("pubmed", "https://pubmed.ncbi.nlm.nih.gov/33758224/"),
    ("jstor", "https://www.jstor.org/stable/10.5325/jafrireli.14.1.0001"),
    ("acm_digital_library", "https://dl.acm.org/doi/10.1145/3411764.3445623"),
    ("acs_publications", "https://pubs.acs.org/doi/10.1021/jacs.0c12533"),
    ("ieee_xplore", "https://ieeexplore.ieee.org/document/9349473"),
    ("biomed_central", "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02262-1"),
    ("plos", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0248220"),
    ("project_muse", "https://muse.jhu.edu/article/786236"),
    ("biorxiv_highwire", "https://www.biorxiv.org/content/10.1101/2021.03.11.434931v1"),

    # --- Tier 2: Expanded Academic & Research Sources ---
    ("oxford_university_press", "https://academic.oup.com/mnras/article/372/2/643/1067311"),
    ("cambridge_core", "https://www.cambridge.org/core/journals/international-organization/article/abs/power-and-plenty-in-the-first-millennium-ad/A55A51C6A531918118A51A25A39D6582"),
    ("google_scholar", "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=deep+learning+for+natural+language+processing&btnG="),
    ("semantic_scholar", "https://www.semanticscholar.org/paper/Attention-Is-All-You-Need-Vaswani-Shazeer/204e3073870fae3d05bcbc2f6a8e263d9b72e776"),
    ("researchgate", "https://www.researchgate.net/publication/322459231_BERT_Pre-training_of_Deep_Bidirectional_Transformers_for_Language_Understanding"),
    ("nasa_ads", "https://ui.adsabs.harvard.edu/abs/2017Natur.549...80K/abstract"),
    ("elife", "https://elifesciences.org/articles/65774"),
    ("mdpi", "https://www.mdpi.com/1422-0067/22/6/2913"),

    # --- Tier 3: News, Media, and Archives ---
    ("guardian", "https://www.theguardian.com/science/2021/mar/11/new-light-shed-on-mystery-of-great-unconformity-in-grand-canyon"),
    ("nytimes", "https://www.nytimes.com/2021/03/10/science/ancient-dna-mammoths.html"),
    ("internet_archive", "https://archive.org/details/TheAdventuresOfTomSawyer_201303"),
    ("bbc_news", "https://www.bbc.com/news/science-environment-56328948"),
    ("reuters", "https://www.reuters.com/lifestyle/science/scientists-find-oldest-known-dna-ancient-mammoth-teeth-2021-02-17/"),
    ("wired", "https://www.wired.com/story/the-teeny-tiny-master-of-the-energy-universe/"),
    ("youtube", "https://www.youtube.com/watch?v=1t_Co0g_t4Y"),  # Test for video metadata

    # --- Tier 4: Government, Institutional, and Niche Repositories ---
    ("world_bank_okr", "https://openknowledge.worldbank.org/handle/10986/35223"),
    ("github", "https://github.com/zotero/zotero"),  # Test for software citation
    ("imdb", "https://www.imdb.com/title/tt0111161/"),  # Test for film metadata
    ("wikipedia", "https://en.wikipedia.org/wiki/Zotero"),  # Test for encyclopedia articles
    ("hathitrust", "https://babel.hathitrust.org/cgi/pt?id=uc1.b4000000&view=1up&seq=7"),
    ("dpla", "https://dp.la/item/05001541315c147233a0f6b8b98a39e3"),  # Digital Public Library of America

    # --- Tier 5: International Sources ---
    ("cnki", "https://en.cnki.com.cn/Article_en/CJFDTotal-ZGNY202102001.htm"),  # China National Knowledge Infrastructure
    # Id normalised from "ciNii" to match the lowercase convention of every other id.
    ("cinii", "https://ci.nii.ac.jp/naid/130007997939"),  # Japanese Institutional Repositories
    ("scielo", "https://www.scielo.br/j/rbz/a/F9S3gY8YxGqXyQ5Z8zW3vXg/?lang=en"),  # SciELO Brazil

    # --- Final Validation: No Match Case ---
    ("no_match", "https://www.google.com"),
]
|
|
452
|
-
|
|
453
|
-
# --- Pytest Fixtures ---
|
|
454
|
-
|
|
455
|
-
@pytest.fixture(scope="session")
def event_loop():
    """Provide a single new event loop for the session and close it on teardown."""
    loop_policy = asyncio.get_event_loop_policy()
    shared_loop = loop_policy.new_event_loop()
    yield shared_loop
    shared_loop.close()
|
|
461
|
-
|
|
462
|
-
@pytest.fixture(scope="session")
def runner() -> "ZoteroTranslatorRunner":
    """
    Provide a single ZoteroTranslatorRunner instance for all tests.

    The module-level import of ZoteroTranslatorRunner is commented out in this
    script (the class is marked deprecated), so the name is not defined at
    import time. The annotation is therefore quoted, and the class is resolved
    lazily from its documented old location. If that module cannot be
    imported, the dependent tests are skipped instead of failing with a
    NameError.

    Returns:
        A new ZoteroTranslatorRunner instance.
    """
    legacy_module = pytest.importorskip(
        "scitex.scholar.url.helpers.finders._ZoteroTranslatorRunner"
    )
    print("\n--- Initializing ZoteroTranslatorRunner ---")
    return legacy_module.ZoteroTranslatorRunner()
|
|
467
|
-
|
|
468
|
-
@pytest.fixture(scope="session")
async def browser() -> Browser:
    """Start a single headless Chromium browser shared by the entire test session."""
    async with async_playwright() as pw:
        print("\n--- Launching Browser ---")
        session_browser = await pw.chromium.launch(headless=True)
        yield session_browser
        # Teardown: announce and close the browser while Playwright is still active.
        print("\n--- Closing Browser ---")
        await session_browser.close()
|
|
477
|
-
|
|
478
|
-
@pytest.fixture
async def page(browser: Browser) -> Page:
    """Hand each test its own browser page; the page is closed after the test."""
    test_page = await browser.new_page()
    yield test_page
    await test_page.close()
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
# --- Test Functions ---
|
|
487
|
-
|
|
488
|
-
@pytest.mark.parametrize("test_id, url", TEST_CASES)
async def test_translator_execution(runner: "ZoteroTranslatorRunner", page: Page, test_id: str, url: str):
    """
    Tests the Zotero runner against a specific URL.

    The ``runner`` annotation is quoted because the ZoteroTranslatorRunner
    import is commented out in this script; an unquoted annotation would
    raise NameError while the module is being collected.

    Args:
        runner: The ZoteroTranslatorRunner instance.
        page: The Playwright page object for the test.
        test_id: A friendly name for the test case.
        url: The URL to test the translator against.
    """
    # Sources for which a PDF/document link is not expected; for these the
    # test only requires that some URL string was returned.
    non_document_ids = frozenset({
        "guardian", "nytimes", "internet_archive", "bbc_news", "reuters",
        "wired", "youtube", "github", "imdb", "wikipedia", "dpla",
    })
    # Substrings that make a URL count as a plausible document link.
    document_keywords = ("pdf", "doi", "abs", "full", "content")

    print(f"\n[Testing: {test_id}] Navigating to: {url}")

    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=90000)
        # Add a longer delay for complex sites or those with anti-bot measures
        await page.wait_for_timeout(5000)
    except Exception as e:
        pytest.fail(f"Failed to navigate to {url}: {e}")

    # Execute the translator runner
    extracted_urls = await runner.extract_urls_pdf_async(page)

    if test_id == "no_match":
        # Graceful-failure case: an empty result, not None and not an exception.
        assert extracted_urls is not None, "Runner should not crash on no match"
        assert len(extracted_urls) == 0, f"Expected no URLs for {test_id}, but found {len(extracted_urls)}"
        print(f"✅ [SUCCESS: {test_id}] Correctly found no matching translator.")
        return

    assert extracted_urls is not None, "Runner should return a list, not None"
    assert len(extracted_urls) > 0, f"Expected at least one URL for {test_id}, but found none."

    first_url = extracted_urls[0]
    assert isinstance(first_url, str), "Result should be a list of strings"

    if test_id not in non_document_ids:
        # Some academic pages link to the abstract/full-text HTML first, so any
        # document-like link is accepted rather than strictly a PDF.
        lowered = first_url.lower()
        assert any(kw in lowered for kw in document_keywords), \
            f"Expected a document-like URL for {test_id}, but got: {first_url}"

    print(f"✅ [SUCCESS: {test_id}] Found {len(extracted_urls)} URL(s). First URL: {first_url}")
|
|
534
|
-
|
|
535
|
-
# Running the file directly does not execute tests; it prints usage plus the case list.
if __name__ == "__main__":
    print(
        "This is a pytest script. To run the tests, execute 'pytest' in your terminal.",
        "Example Test Cases:",
        sep="\n",
    )
    for test_id, url in TEST_CASES:
        print(f"- {test_id.capitalize()}: {url}")
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
Could you check the test cases in the JavaScript contents? I think we can make use of them to automate the tests in Python as well.
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
Show thinking
|
|
559
|
-
Of course. You've pointed out a fantastic feature of the Zotero translator files that we can absolutely leverage. Many of the .js files contain their own built-in test cases, which are a goldmine for creating a truly comprehensive and automated test suite.
|
|
560
|
-
|
|
561
|
-
Your idea to use these is excellent because it allows us to:
|
|
562
|
-
|
|
563
|
-
Automate Test Creation: We don't have to manually find URLs for every single translator.
|
|
564
|
-
|
|
565
|
-
Verify Exact Outcomes: The embedded tests often specify the exact metadata that should be extracted, allowing for much more precise assertions than just checking if a PDF URL was found.
|
|
566
|
-
|
|
567
|
-
Ensure Long-Term Stability: As you update your runner or the translators themselves, this automated suite will immediately catch any regressions.
|
|
568
|
-
|
|
569
|
-
## The Automated Approach
|
|
570
|
-
Here is the strategy to integrate these embedded test cases into your Python test script:
|
|
571
|
-
|
|
572
|
-
Discover Translators: The script will first scan your zotero_translators directory to find all .js files.
|
|
573
|
-
|
|
574
|
-
Parse Embedded Tests: For each file, it will read the content and look for the /** BEGIN TEST CASES **/ block. It will then parse the JSON array found inside.
|
|
575
|
-
|
|
576
|
-
Dynamically Generate Pytest Cases: The script will dynamically create parameterized test cases for pytest based on the URLs and expected outcomes from the parsed JSON.
|
|
577
|
-
|
|
578
|
-
Execute and Assert: The existing pytest framework will then run these generated tests. The assertions will be modified to check not just for success, but for the correctness of the extracted data when available.
|
|
579
|
-
|
|
580
|
-
This makes the entire process scalable and self-maintaining.
|
|
581
|
-
|
|
582
|
-
## The Enhanced pytest Script
|
|
583
|
-
I've modified the test_zotero_runner.py script to automatically discover and use these embedded test cases. This new version is much more powerful.
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
``` python
|
|
587
|
-
#!/usr/bin/env python3
|
|
588
|
-
# -*- coding: utf-8 -*-
|
|
589
|
-
# File: test_zotero_runner.py
|
|
590
|
-
# ----------------------------------------
|
|
591
|
-
"""
|
|
592
|
-
Comprehensive Test Suite for ZoteroTranslatorRunner.
|
|
593
|
-
|
|
594
|
-
This script uses pytest to validate that the ZoteroTranslatorRunner can
|
|
595
|
-
correctly execute translators. It dynamically discovers translators and their
|
|
596
|
-
embedded test cases, creating a robust and automated testing framework.
|
|
597
|
-
|
|
598
|
-
Usage:
|
|
599
|
-
1. Make sure you have pytest and pytest-asyncio installed:
|
|
600
|
-
pip install pytest pytest-asyncio playwright
|
|
601
|
-
playwright install
|
|
602
|
-
2. Place this file in a 'tests' directory.
|
|
603
|
-
3. Ensure your _ZoteroTranslatorRunner.py and the 'zotero_translators'
|
|
604
|
-
directory are accessible from your project root.
|
|
605
|
-
4. Run the tests from your project's root directory:
|
|
606
|
-
pytest
|
|
607
|
-
"""
|
|
608
|
-
|
|
609
|
-
import asyncio
|
|
610
|
-
import pytest
|
|
611
|
-
import os
|
|
612
|
-
import json
|
|
613
|
-
import re
|
|
614
|
-
from playwright.async_api import Page, Browser, async_playwright
|
|
615
|
-
|
|
616
|
-
# Note: ZoteroTranslatorRunner is now deprecated. Use the Python-based zotero_translators instead.
|
|
617
|
-
# Old location (deprecated): from scitex.scholar.url.helpers.finders._ZoteroTranslatorRunner import ZoteroTranslatorRunner
|
|
618
|
-
# New location: Use scitex.scholar.url.strategies.zotero_translators for Python-based implementation
|
|
619
|
-
|
|
620
|
-
# --- Constants ---
|
|
621
|
-
# Directory scanned for translator .js files, resolved relative to this file.
TRANSLATORS_DIR = os.path.join(
    os.path.dirname(__file__),
    *('..', 'src', 'scitex', 'scholar', 'url', 'strategies', 'zotero_translators'),
)
|
|
622
|
-
|
|
623
|
-
# --- Test Case Discovery ---
|
|
624
|
-
|
|
625
|
-
def discover_translator_test_cases():
    """
    Yield pytest parameters parsed from the test cases embedded in translators.

    Scans TRANSLATORS_DIR for ``.js`` files (in sorted order, so generated
    test ids are stable between runs), extracts the text between the
    ``/** BEGIN TEST CASES **/`` and ``/** END TEST CASES **/`` markers and
    parses it as JSON.

    Zotero translators write that block as ``var testCases = [ ... ]``, so the
    assignment prefix and an optional trailing semicolon are removed before
    parsing. Without this the text never starts with ``[`` and is wrapped
    into invalid JSON. A bare sequence of objects is still wrapped in
    brackets.

    Yields:
        ``pytest.param(test_id, url, items, id=test_id)`` for every entry
        whose ``type`` is ``"web"`` and which has a non-empty ``url``.
        ``items`` is ``None`` when the entry has no ``items`` key.
    """
    if not os.path.isdir(TRANSLATORS_DIR):
        print(f"\nWarning: Translators directory not found at {TRANSLATORS_DIR}")
        return

    # Non-greedy match so only the first marker-delimited block is captured.
    block_pattern = re.compile(
        r'/\*\* BEGIN TEST CASES \*\*/\s*(.*?)\s*/\*\* END TEST CASES \*\*/',
        re.DOTALL,
    )
    # Leading JavaScript assignment that precedes the JSON array.
    assignment_prefix = re.compile(r'^(?:var|let|const)\s+testCases\s*=\s*')

    for filename in sorted(os.listdir(TRANSLATORS_DIR)):
        if not filename.endswith(".js"):
            continue

        filepath = os.path.join(TRANSLATORS_DIR, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        match = block_pattern.search(content)
        if not match:
            continue

        raw_cases = assignment_prefix.sub('', match.group(1).strip())
        raw_cases = raw_cases.rstrip().rstrip(';').rstrip()
        # Some blocks are a series of objects rather than an array.
        if not raw_cases.startswith('['):
            raw_cases = f"[{raw_cases}]"

        try:
            test_cases = json.loads(raw_cases)
        except json.JSONDecodeError as e:
            print(f"\nWarning: Could not parse test cases in {filename}: {e}")
            continue

        # Strip only the trailing extension so names containing ".js" survive.
        stem = filename[:-len(".js")]
        for i, test_case in enumerate(test_cases):
            if not isinstance(test_case, dict):
                continue
            if test_case.get("type") == "web" and test_case.get("url"):
                test_id = f"{stem}-{i}"
                # .get avoids a KeyError aborting collection when "items" is absent.
                yield pytest.param(test_id, test_case['url'], test_case.get('items'), id=test_id)
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
# --- Pytest Fixtures ---
|
|
659
|
-
|
|
660
|
-
@pytest.fixture(scope="session")
def event_loop():
    """Create one event loop for the session, yield it, and close it afterwards."""
    fresh_loop = asyncio.get_event_loop_policy().new_event_loop()
    try_yield = fresh_loop  # same object is yielded and later closed
    yield try_yield
    fresh_loop.close()
|
|
665
|
-
|
|
666
|
-
@pytest.fixture(scope="session")
def runner() -> "ZoteroTranslatorRunner":
    """
    Provide one ZoteroTranslatorRunner for the whole session.

    ZoteroTranslatorRunner is not imported at module level in this script
    (the import is commented out as deprecated), so the annotation is quoted
    and the class is loaded lazily from its documented old location. When
    that module is unavailable, the dependent tests are skipped rather than
    failing with a NameError.

    Returns:
        A new ZoteroTranslatorRunner instance.
    """
    legacy_module = pytest.importorskip(
        "scitex.scholar.url.helpers.finders._ZoteroTranslatorRunner"
    )
    print("\n--- Initializing ZoteroTranslatorRunner ---")
    return legacy_module.ZoteroTranslatorRunner()
|
|
670
|
-
|
|
671
|
-
@pytest.fixture(scope="session")
async def browser() -> Browser:
    """Launch a headless Chromium browser once and reuse it for the session."""
    async with async_playwright() as driver:
        print("\n--- Launching Browser ---")
        chromium_browser = await driver.chromium.launch(headless=True)
        yield chromium_browser
        print("\n--- Closing Browser ---")
        await chromium_browser.close()
|
|
679
|
-
|
|
680
|
-
@pytest.fixture
async def page(browser: Browser) -> Page:
    """Create a new page on the shared browser for a test and close it when done."""
    opened = await browser.new_page()
    yield opened
    await opened.close()
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
# --- Test Functions ---
|
|
688
|
-
|
|
689
|
-
@pytest.mark.parametrize("test_id, url, expected_items", discover_translator_test_cases())
async def test_translator_execution(runner: "ZoteroTranslatorRunner", page: Page, test_id: str, url: str, expected_items):
    """
    Tests the Zotero runner against a specific URL from an embedded test case.

    The ``runner`` annotation is quoted because ZoteroTranslatorRunner is not
    imported in this script; an unquoted annotation would raise NameError at
    collection time.

    Args:
        runner: The ZoteroTranslatorRunner instance.
        page: The Playwright page object for the test.
        test_id: Identifier built from the translator file name and case index.
        url: The URL taken from the embedded test case.
        expected_items: The ``items`` value of the embedded test case: the
            string ``"multiple"``, a list of expected items, or anything else
            (which causes the case to be skipped).
    """
    print(f"\n[Testing: {test_id}] Navigating to: {url}")

    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(3000)
    except Exception as e:
        pytest.fail(f"Failed to navigate to {url}: {e}")

    # Execute the translator runner
    extracted_urls = await runner.extract_urls_pdf_async(page)

    if expected_items == "multiple":
        assert extracted_urls is not None, "Runner should not crash"
        assert len(extracted_urls) > 0, f"Expected multiple items for {test_id}, but found none."
        print(f"✅ [SUCCESS: {test_id}] Correctly identified as multiple.")
    elif isinstance(expected_items, list) and len(expected_items) == 0:
        # Guard against None first so a bad return gives a clear assertion
        # message instead of a TypeError from len().
        assert extracted_urls is not None, "Runner should return a list, not None"
        assert len(extracted_urls) == 0, f"Expected no items for {test_id}, but found {len(extracted_urls)}."
        print(f"✅ [SUCCESS: {test_id}] Correctly found no items.")
    elif isinstance(expected_items, list) and len(expected_items) > 0:
        assert extracted_urls is not None, "Runner should return a list, not None"
        assert len(extracted_urls) > 0, f"Expected at least one URL for {test_id}, but found none."

        first_url = extracted_urls[0]
        assert isinstance(first_url, str), "Result should be a list of strings"

        # A simple check for now, can be expanded to compare metadata
        print(f"✅ [SUCCESS: {test_id}] Found {len(extracted_urls)} URL(s). First URL: {first_url}")
    else:
        # 'items' is neither "multiple" nor a list, so nothing can be asserted.
        # Report a real skip so the case is not counted as a pass.
        pytest.skip(f"[SKIPPED: {test_id}] Non-standard 'items' field in test case: {expected_items}")
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
# Running the file directly executes no tests; it only points the user to pytest.
if __name__ == "__main__":
    usage_hint = "This is a pytest script. To run the tests, execute 'pytest' in your terminal."
    print(usage_hint)
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
Yes, the translatorType in the header of each JavaScript file is a crucial piece of metadata that tells Zotero what kind of translator it is and how it should be used. It's a numeric code where different numbers (and their combinations) represent different functionalities.
|
|
733
|
-
|
|
734
|
-
Based on your search results, here is a breakdown of the patterns and what they mean.
|
|
735
|
-
|
|
736
|
-
Translator Type Categories
|
|
737
|
-
The translatorType is a bitwise flag. This means that a single number can represent multiple capabilities by adding up the values of the types it includes.
|
|
738
|
-
|
|
739
|
-
"translatorType": 4 (Web Scraper)
|
|
740
|
-
|
|
741
|
-
Meaning: This is by far the most common type in your collection. It signifies a standard web translator designed to run on a live webpage.
|
|
742
|
-
|
|
743
|
-
Function: It uses the detectWeb() and doWeb() entry points to scrape metadata directly from the HTML of a webpage.
|
|
744
|
-
|
|
745
|
-
Your Use Case: These are the primary translators your Python runner is designed to execute.
|
|
746
|
-
|
|
747
|
-
"translatorType": 1 (Import)
|
|
748
|
-
|
|
749
|
-
Meaning: This is an import translator. It's not designed to read a live webpage.
|
|
750
|
-
|
|
751
|
-
Function: Its purpose is to parse a file or a block of text that is already in a structured format (like BibTeX, RIS, or MARC) and convert it into Zotero items. For example, BibTeX.js would be an import translator.
|
|
752
|
-
|
|
753
|
-
Your Use Case: Your runner would generally not use these, as you are starting from a URL, not a text file.
|
|
754
|
-
|
|
755
|
-
"translatorType": 2 (Export)
|
|
756
|
-
|
|
757
|
-
Meaning: This is an export translator.
|
|
758
|
-
|
|
759
|
-
Function: It does the reverse of an import translator. It takes Zotero items and converts them into a specific text-based format (like a BibTeX file or a formatted bibliography). BibLaTeX.js is a perfect example.
|
|
760
|
-
|
|
761
|
-
Your Use Case: These are not relevant to your scraping task.
|
|
762
|
-
|
|
763
|
-
"translatorType": 8 (Search)
|
|
764
|
-
|
|
765
|
-
Meaning: This is a search translator.
|
|
766
|
-
|
|
767
|
-
Function: It's designed to work with library database search protocols like SRU or Z39.50. It doesn't scrape a webpage but rather sends a structured query to a database and parses the structured response. Your ADS Bibcode.js and various ISBN translators fall into this category.
|
|
768
|
-
|
|
769
|
-
Your Use Case: Your current runner doesn't support this, as it would require implementing the specific search protocols.
|
|
770
|
-
|
|
771
|
-
"translatorType": 12 (Web Scraper + Search)
|
|
772
|
-
|
|
773
|
-
Meaning: This is a hybrid translator. The number 12 is the sum of 4 (Web) + 8 (Search).
|
|
774
|
-
|
|
775
|
-
Function: This translator can both scrape a live webpage and perform a direct search query. PubMed.js and arXiv.org.js are great examples. They can scrape a single article page but can also be used by Zotero to search the entire PubMed or arXiv database.
|
|
776
|
-
|
|
777
|
-
Your Use Case: Your runner can successfully use the web scraping (4) part of these translators.
|
|
778
|
-
|
|
779
|
-
Summary of Patterns
|
|
780
|
-
For your project, the most important distinction is that your runner is built to handle translators whose type is 4 or includes 4 (like 12 = 4 + 8). The other types (1, 2, 8, and 3, which is Import + Export, i.e. 1 + 2) are for different functions within Zotero (importing, exporting, and specialized searching) and won't be executed by your current web-based automation script.
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
So, all we need to do is concentrate on 4 and 12.
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
Then, another question: how many translators, and which ones, do you think are vital for us?
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
Show thinking
|
|
798
|
-
You've asked two excellent strategic questions. Here’s a breakdown.
|
|
799
|
-
|
|
800
|
-
Focusing on Translator Types 4 and 12
|
|
801
|
-
Yes, your understanding is spot on. For your specific goal of building a web scraper that starts from a URL, you should focus exclusively on translators with translatorType 4 (Web) and 12 (Web + Search).
|
|
802
|
-
|
|
803
|
-
Why? Your runner is designed to operate on live webpages. Only translators marked as "Web" (type 4) are built for this purpose. Type 12 is just a "Web" translator that also has a "Search" capability, so it's perfectly compatible with your system.
|
|
804
|
-
|
|
805
|
-
Action: You can modify your Python script that loads the translators to skip any file whose translatorType does not include the Web flag, i.e. where (translatorType & 4) == 0. This keeps types 4 and 12, as well as any other combination that contains 4. It will make your system more efficient and prevent potential errors from trying to run incompatible translator types.
|
|
806
|
-
|
|
807
|
-
Which Translators are Vital for Academic Use?
|
|
808
|
-
You have over 700 translators, but you don't need to ensure all of them work perfectly to have a highly effective system. The Pareto Principle (80/20 rule) applies here: a small number of key translators will cover the vast majority of academic sources you'll encounter.
|
|
809
|
-
|
|
810
|
-
Here is a prioritized list of the most vital translators to focus on for academic purposes:
|
|
811
|
-
|
|
812
|
-
Tier 1: The Core Giants (Must-Haves)
|
|
813
|
-
These platforms host a massive percentage of all modern scientific literature. Ensuring these work flawlessly is your top priority.
|
|
814
|
-
|
|
815
|
-
ScienceDirect.js: For Elsevier journals.
|
|
816
|
-
|
|
817
|
-
Springer Link.js: For Springer content.
|
|
818
|
-
|
|
819
|
-
Wiley Online Library.js: For Wiley journals.
|
|
820
|
-
|
|
821
|
-
Taylor and Francis+NEJM.js: For Taylor & Francis and the New England Journal of Medicine.
|
|
822
|
-
|
|
823
|
-
SAGE Journals.js: For SAGE publications.
|
|
824
|
-
|
|
825
|
-
Tier 2: Major Aggregators & Databases
|
|
826
|
-
These are the central hubs where researchers find papers from many different publishers.
|
|
827
|
-
|
|
828
|
-
PubMed.js: The essential database for all biomedical research.
|
|
829
|
-
|
|
830
|
-
Google Scholar.js: The most widely used academic search engine.
|
|
831
|
-
|
|
832
|
-
JSTOR.js: A critical archive for humanities and social sciences.
|
|
833
|
-
|
|
834
|
-
arXiv.org.js: The main preprint server for physics, computer science, and math.
|
|
835
|
-
|
|
836
|
-
Project MUSE.js: Another key aggregator for humanities and social sciences.
|
|
837
|
-
|
|
838
|
-
Tier 3: High-Impact Publishers & Societies
|
|
839
|
-
These are individual publishers that are extremely influential and common.
|
|
840
|
-
|
|
841
|
-
Nature Publishing Group.js: For Nature and its associated journals.
|
|
842
|
-
|
|
843
|
-
ACS Publications.js: American Chemical Society.
|
|
844
|
-
|
|
845
|
-
ACM Digital Library.js: Association for Computing Machinery.
|
|
846
|
-
|
|
847
|
-
IEEE Xplore.js: Institute of Electrical and Electronics Engineers.
|
|
848
|
-
|
|
849
|
-
Oxford University Press.js
|
|
850
|
-
|
|
851
|
-
Cambridge Core.js
|
|
852
|
-
|
|
853
|
-
Tier 4: Key Platforms & Open Access Hubs
|
|
854
|
-
These translators cover platforms that host many journals or are crucial for finding open-access versions.
|
|
855
|
-
|
|
856
|
-
HighWire 2.0.js: A platform used by many societies (and for preprint servers like bioRxiv).
|
|
857
|
-
|
|
858
|
-
Atypon Journals.js: Another very large platform provider.
|
|
859
|
-
|
|
860
|
-
BioMed Central.js and PLoS Journals.js: Major open-access publishers.
|
|
861
|
-
|
|
862
|
-
By focusing your testing and development efforts on this "vital list" of about 20-25 translators, you will likely cover over 80% of the academic articles your system will need to process.
|
|
863
|
-
```
|
|
864
|
-
|
|
865
|
-
<!-- EOF -->
|