commonmeta_py-0.106-py3-none-any.whl → commonmeta_py-0.108-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. commonmeta/__init__.py +12 -3
  2. commonmeta/api_utils.py +3 -2
  3. commonmeta/base_utils.py +186 -3
  4. commonmeta/cli.py +114 -34
  5. commonmeta/constants.py +20 -0
  6. commonmeta/file_utils.py +112 -0
  7. commonmeta/metadata.py +102 -42
  8. commonmeta/readers/codemeta_reader.py +1 -1
  9. commonmeta/readers/crossref_reader.py +23 -10
  10. commonmeta/readers/crossref_xml_reader.py +1 -1
  11. commonmeta/readers/datacite_reader.py +6 -4
  12. commonmeta/readers/{json_feed_reader.py → jsonfeed_reader.py} +12 -12
  13. commonmeta/resources/crossref/common5.4.0.xsd +1264 -0
  14. commonmeta/resources/crossref/{crossref5.3.1.xsd → crossref5.4.0.xsd} +286 -88
  15. commonmeta/resources/crossref/doi_resources5.4.0.xsd +117 -0
  16. commonmeta/resources/crossref/fundingdata5.4.0.xsd +59 -0
  17. commonmeta/resources/crossref/fundref.xsd +29 -19
  18. commonmeta/resources/crossref/languages5.4.0.xsd +8119 -0
  19. commonmeta/resources/crossref/mediatypes5.4.0.xsd +2207 -0
  20. commonmeta/resources/crossref/module-ali.xsd +14 -6
  21. commonmeta/resources/crossref/standard-modules/mathml3/mathml3-common.xsd +101 -0
  22. commonmeta/resources/crossref/standard-modules/mathml3/mathml3-content.xsd +683 -0
  23. commonmeta/resources/crossref/standard-modules/mathml3/mathml3-presentation.xsd +2092 -0
  24. commonmeta/resources/crossref/standard-modules/mathml3/mathml3-strict-content.xsd +186 -0
  25. commonmeta/resources/crossref/standard-modules/mathml3/mathml3.xsd +9 -0
  26. commonmeta/resources/crossref/standard-modules/mathml3/module-ali.xsd +47 -0
  27. commonmeta/resources/crossref/standard-modules/module-ali.xsd +47 -0
  28. commonmeta/resources/crossref/standard-modules/xlink.xsd +100 -0
  29. commonmeta/resources/crossref/standard-modules/xml.xsd +287 -0
  30. commonmeta/resources/crossref/xml.xsd +287 -0
  31. commonmeta/schema_utils.py +25 -0
  32. commonmeta/utils.py +90 -15
  33. commonmeta/writers/bibtex_writer.py +5 -5
  34. commonmeta/writers/citation_writer.py +10 -5
  35. commonmeta/writers/commonmeta_writer.py +5 -17
  36. commonmeta/writers/crossref_xml_writer.py +1032 -4
  37. commonmeta/writers/csl_writer.py +6 -6
  38. commonmeta/writers/datacite_writer.py +11 -6
  39. commonmeta/writers/inveniordm_writer.py +286 -10
  40. commonmeta/writers/ris_writer.py +3 -3
  41. commonmeta/writers/schema_org_writer.py +10 -5
  42. {commonmeta_py-0.106.dist-info → commonmeta_py-0.108.dist-info}/METADATA +5 -2
  43. {commonmeta_py-0.106.dist-info → commonmeta_py-0.108.dist-info}/RECORD +46 -32
  44. commonmeta/crossref_utils.py +0 -583
  45. commonmeta/resources/crossref/common5.3.1.xsd +0 -1538
  46. {commonmeta_py-0.106.dist-info → commonmeta_py-0.108.dist-info}/WHEEL +0 -0
  47. {commonmeta_py-0.106.dist-info → commonmeta_py-0.108.dist-info}/entry_points.txt +0 -0
  48. {commonmeta_py-0.106.dist-info → commonmeta_py-0.108.dist-info}/licenses/LICENSE +0 -0
commonmeta/__init__.py CHANGED
@@ -10,7 +10,7 @@ commonmeta-py is a Python library to convert scholarly metadata
10
10
  """
11
11
 
12
12
  __title__ = "commonmeta-py"
13
- __version__ = "0.106"
13
+ __version__ = "0.108"
14
14
  __author__ = "Martin Fenner"
15
15
  __license__ = "MIT"
16
16
 
@@ -54,6 +54,14 @@ from .doi_utils import (
54
54
  validate_doi,
55
55
  validate_prefix,
56
56
  )
57
+ from .file_utils import (
58
+ download_file,
59
+ read_file,
60
+ read_gz_file,
61
+ read_zip_file,
62
+ uncompress_content,
63
+ unzip_content,
64
+ )
57
65
  from .metadata import Metadata, MetadataList
58
66
  from .readers import (
59
67
  cff_reader,
@@ -63,7 +71,7 @@ from .readers import (
63
71
  datacite_reader,
64
72
  datacite_xml_reader,
65
73
  inveniordm_reader,
66
- json_feed_reader,
74
+ jsonfeed_reader,
67
75
  kbase_reader,
68
76
  openalex_reader,
69
77
  ris_reader,
@@ -75,7 +83,7 @@ from .utils import (
75
83
  extract_url,
76
84
  extract_urls,
77
85
  from_csl,
78
- from_json_feed,
86
+ from_jsonfeed,
79
87
  from_schema_org,
80
88
  get_language,
81
89
  issn_as_url,
@@ -97,6 +105,7 @@ from .writers import (
97
105
  bibtex_writer,
98
106
  citation_writer,
99
107
  commonmeta_writer,
108
+ crossref_xml_writer,
100
109
  csl_writer,
101
110
  datacite_writer,
102
111
  ris_writer,
commonmeta/api_utils.py CHANGED
@@ -7,8 +7,9 @@ import jwt
7
7
  import requests
8
8
  from furl import furl
9
9
 
10
+ from commonmeta.readers.jsonfeed_reader import get_jsonfeed_uuid
11
+
10
12
  from .doi_utils import doi_as_url, validate_doi
11
- from .readers.json_feed_reader import get_json_feed_item_uuid
12
13
 
13
14
 
14
15
  def generate_ghost_token(key: str) -> str:
@@ -34,7 +35,7 @@ def update_ghost_post_via_api(
34
35
  """Update Ghost post via API"""
35
36
  # get post doi and url from Rogue Scholar API
36
37
  # post url is needed to find post via Ghost API
37
- post = get_json_feed_item_uuid(_id)
38
+ post = get_jsonfeed_uuid(_id)
38
39
  if post.get("error", None):
39
40
  return post
40
41
  doi = validate_doi(post.get("doi", None))
commonmeta/base_utils.py CHANGED
@@ -2,10 +2,13 @@
2
2
 
3
3
  import html
4
4
  import re
5
+ import uuid
6
+ from datetime import datetime
5
7
  from os import path
6
8
  from typing import Optional, Union
7
9
 
8
10
  import nh3
11
+ import pydash as py_
9
12
  import xmltodict
10
13
 
11
14
 
@@ -67,8 +70,8 @@ def parse_attributes(
67
70
 
68
71
 
69
72
  def parse_xml(string: Optional[str], **kwargs) -> Optional[Union[dict, list]]:
70
- """Parse XML into dict. Set default options, and options for Crossref XML"""
71
- if string is None:
73
+ """Parse XML into dict using xmltodict. Set default options, and options for Crossref XML"""
74
+ if string is None or string == "{}":
72
75
  return None
73
76
  if path.exists(string):
74
77
  with open(string, encoding="utf-8") as file:
@@ -77,7 +80,7 @@ def parse_xml(string: Optional[str], **kwargs) -> Optional[Union[dict, list]]:
77
80
  if kwargs.get("dialect", None) == "crossref":
78
81
  # remove namespaces from xml
79
82
  namespaces = {
80
- "http://www.crossref.org/schema/5.3.1": None,
83
+ "http://www.crossref.org/schema/5.4.0": None,
81
84
  "http://www.crossref.org/qrschema/3.0": None,
82
85
  "http://www.crossref.org/xschema/1.0": None,
83
86
  "http://www.crossref.org/xschema/1.1": None,
@@ -93,6 +96,7 @@ def parse_xml(string: Optional[str], **kwargs) -> Optional[Union[dict, list]]:
93
96
  "person_name",
94
97
  "organization",
95
98
  "titles",
99
+ "abstract",
96
100
  "item",
97
101
  "citation",
98
102
  "program",
@@ -105,6 +109,172 @@ def parse_xml(string: Optional[str], **kwargs) -> Optional[Union[dict, list]]:
105
109
  return xmltodict.parse(string, **kwargs)
106
110
 
107
111
 
112
+ def unparse_xml(input: Optional[dict], **kwargs) -> str:
113
+ """Unparse (dump) dict into XML using xmltodict. Set default options, and options for Crossref XML"""
114
+ if input is None:
115
+ return None
116
+ if kwargs.get("dialect", None) == "crossref":
117
+ # Add additional logic for crossref dialect
118
+ # add body and root element as wrapping elements
119
+ type = next(iter(input))
120
+ attributes = input.get(type)
121
+ input.pop(type)
122
+
123
+ if type == "book":
124
+ book_metadata = py_.get(input, "book_metadata") or {}
125
+ input.pop("book_metadata")
126
+ book_metadata = {**book_metadata, **input}
127
+ input = {"book": {**attributes, "book_metadata": book_metadata}}
128
+ elif type == "database":
129
+ database_metadata = py_.get(input, "database_metadata") or {}
130
+ input.pop("database_metadata")
131
+ val = input.pop("publisher_item")
132
+ institution = input.pop("institution", None)
133
+ database_metadata = {**{"titles": val}, **database_metadata}
134
+ database_metadata["institution"] = institution or {}
135
+ component = input.pop("component", None)
136
+ input = {
137
+ "database": {
138
+ **attributes,
139
+ "database_metadata": database_metadata,
140
+ "component_list": {"component": component | input},
141
+ }
142
+ }
143
+ elif type == "journal":
144
+ journal_metadata = py_.get(input, "journal_metadata") or {}
145
+ journal_issue = py_.get(input, "journal_issue") or {}
146
+ journal_article = py_.get(input, "journal_article") or {}
147
+ input.pop("journal_metadata")
148
+ input.pop("journal_issue")
149
+ input.pop("journal_article")
150
+ input = {
151
+ "journal": {
152
+ "journal_metadata": journal_metadata,
153
+ "journal_issue": journal_issue,
154
+ "journal_article": journal_article | input,
155
+ }
156
+ }
157
+ elif type == "proceedings_article":
158
+ proceedings_metadata = py_.get(input, "proceedings_metadata") or {}
159
+ input.pop("proceedings_metadata")
160
+ input = {
161
+ "proceedings": {
162
+ **attributes,
163
+ "proceedings_metadata": proceedings_metadata,
164
+ "conference_paper": input,
165
+ }
166
+ }
167
+ elif type == "sa_component":
168
+ component = py_.get(input, "component") or {}
169
+ input.pop("component")
170
+ input = {
171
+ "sa_component": {
172
+ **attributes,
173
+ "component_list": {"component": component | input},
174
+ }
175
+ }
176
+ else:
177
+ input = {type: attributes | input}
178
+
179
+ doi_batch = {
180
+ "@xmlns": "http://www.crossref.org/schema/5.4.0",
181
+ "@version": "5.4.0",
182
+ "head": get_crossref_xml_head(input),
183
+ "body": input,
184
+ }
185
+ input = {"doi_batch": doi_batch}
186
+ kwargs["pretty"] = True
187
+ kwargs["indent"] = " "
188
+ kwargs.pop("dialect", None)
189
+ return xmltodict.unparse(input, **kwargs)
190
+
191
+
192
+ def unparse_xml_list(input: Optional[list], **kwargs) -> str:
193
+ """Unparse (dump) list into XML using xmltodict. Set default options, and options for Crossref XML"""
194
+ if input is None:
195
+ return None
196
+ if kwargs.get("dialect", None) == "crossref":
197
+ # Add additional logic for crossref dialect
198
+ # add body and root element as wrapping elements
199
+
200
+ # Group items by type with minimal grouping
201
+ items_by_type = {}
202
+
203
+ for item in wrap(input):
204
+ type = next(iter(item))
205
+ attributes = item.get(type)
206
+ item.pop(type)
207
+
208
+ # handle nested book_metadata and journal structure as in unparse_xml
209
+ if type == "book":
210
+ book_metadata = py_.get(item, "book_metadata") or {}
211
+ item.pop("book_metadata")
212
+ book_metadata = {**book_metadata, **item}
213
+ item = {"book": {**attributes, "book_metadata": book_metadata}}
214
+ elif type == "database":
215
+ database_metadata = py_.get(item, "database_metadata") or {}
216
+ item.pop("database_metadata")
217
+ database_metadata = {**database_metadata, **item}
218
+ item = {
219
+ "database": {**attributes, "database_metadata": database_metadata}
220
+ }
221
+ elif type == "journal":
222
+ journal_metadata = py_.get(item, "journal_metadata") or {}
223
+ journal_issue = py_.get(item, "journal_issue") or {}
224
+ journal_article = py_.get(item, "journal_article") or {}
225
+ item.pop("journal_metadata")
226
+ item.pop("journal_issue")
227
+ item.pop("journal_article")
228
+ item = {
229
+ "journal": {
230
+ "journal_metadata": journal_metadata,
231
+ "journal_issue": journal_issue,
232
+ "journal_article": journal_article | item,
233
+ }
234
+ }
235
+ elif type == "sa_component":
236
+ component = py_.get(input, "component") or {}
237
+ item.pop("component")
238
+ item = {
239
+ "sa_component": {
240
+ **attributes,
241
+ "component_list": {"component": component | item},
242
+ }
243
+ }
244
+ else:
245
+ item = {type: attributes | item}
246
+
247
+ # Add item to appropriate type bucket
248
+ if type not in items_by_type:
249
+ items_by_type[type] = []
250
+ items_by_type[type].append(item[type])
251
+
252
+ # Create the final structure with body containing all grouped items
253
+ body_content = {}
254
+ for type_key, items in items_by_type.items():
255
+ if len(items) == 1:
256
+ body_content[type_key] = items[0] # Use single item without array
257
+ else:
258
+ body_content[type_key] = items # Use array when multiple items
259
+ head = kwargs["head"] or {}
260
+ doi_batch = {
261
+ "@xmlns": "http://www.crossref.org/schema/5.4.0",
262
+ "@xmlns:ai": "http://www.crossref.org/AccessIndicators.xsd",
263
+ "@xmlns:rel": "http://www.crossref.org/relations.xsd",
264
+ "@xmlns:fr": "http://www.crossref.org/fundref.xsd",
265
+ "@version": "5.4.0",
266
+ "head": get_crossref_xml_head(head),
267
+ "body": body_content,
268
+ }
269
+ output = {"doi_batch": doi_batch}
270
+
271
+ kwargs["pretty"] = True
272
+ kwargs["indent"] = " "
273
+ kwargs.pop("dialect", None)
274
+ kwargs.pop("head", None)
275
+ return xmltodict.unparse(output, **kwargs)
276
+
277
+
108
278
  def sanitize(text: str, **kwargs) -> str:
109
279
  """Sanitize text"""
110
280
  # default whitelisted HTML tags
@@ -122,3 +292,16 @@ def sanitize(text: str, **kwargs) -> str:
122
292
  string = nh3.clean(text, tags=tags, attributes=attributes, link_rel=None)
123
293
  # remove excessive internal whitespace
124
294
  return " ".join(re.split(r"\s+", string, flags=re.UNICODE))
295
+
296
+
297
+ def get_crossref_xml_head(metadata: dict) -> dict:
298
+ """Get head element for Crossref XML"""
299
+ return {
300
+ "doi_batch_id": str(uuid.uuid4()),
301
+ "timestamp": datetime.now().strftime("%Y%m%d%H%M%S"),
302
+ "depositor": {
303
+ "depositor_name": metadata.get("depositor", None) or "test",
304
+ "email_address": metadata.get("email", None) or "info@example.org",
305
+ },
306
+ "registrant": metadata.get("registrant", None) or "test",
307
+ }
commonmeta/cli.py CHANGED
@@ -2,16 +2,12 @@ import time
2
2
 
3
3
  import click
4
4
  import orjson as json
5
- import pydash as py_
6
5
 
7
6
  from commonmeta import Metadata, MetadataList # __version__
8
7
  from commonmeta.api_utils import update_ghost_post_via_api
9
8
  from commonmeta.doi_utils import decode_doi, encode_doi, validate_prefix
10
9
  from commonmeta.readers.crossref_reader import get_random_crossref_id
11
10
  from commonmeta.readers.datacite_reader import get_random_datacite_id
12
- from commonmeta.readers.json_feed_reader import (
13
- get_json_feed_item_uuid,
14
- )
15
11
  from commonmeta.readers.openalex_reader import get_random_openalex_id
16
12
 
17
13
 
@@ -46,6 +42,49 @@ def convert(
46
42
  email,
47
43
  registrant,
48
44
  show_errors,
45
+ ):
46
+ metadata = Metadata(input, via=via, doi=doi, prefix=prefix)
47
+ if show_errors and not metadata.is_valid:
48
+ raise click.ClickException(str(metadata.errors))
49
+
50
+ click.echo(
51
+ metadata.write(
52
+ to=to,
53
+ style=style,
54
+ locale=locale,
55
+ depositor=depositor,
56
+ email=email,
57
+ registrant=registrant,
58
+ )
59
+ )
60
+ if show_errors and metadata.write_errors:
61
+ raise click.ClickException(str(metadata.write_errors))
62
+
63
+
64
+ @cli.command()
65
+ @click.argument("input", type=str, required=True)
66
+ @click.option("--via", "-f", type=str, default=None)
67
+ @click.option("--to", "-t", type=str, default="commonmeta")
68
+ @click.option("--style", "-s", type=str, default="apa")
69
+ @click.option("--locale", "-l", type=str, default="en-US")
70
+ @click.option("--doi", type=str)
71
+ @click.option("--prefix", type=str)
72
+ @click.option("--depositor", type=str)
73
+ @click.option("--email", type=str)
74
+ @click.option("--registrant", type=str)
75
+ @click.option("--show-errors/--no-errors", type=bool, show_default=True, default=False)
76
+ def put(
77
+ input,
78
+ via,
79
+ to,
80
+ style,
81
+ locale,
82
+ doi,
83
+ prefix,
84
+ depositor,
85
+ email,
86
+ registrant,
87
+ show_errors,
49
88
  ):
50
89
  metadata = Metadata(input, via=via, doi=doi, prefix=prefix)
51
90
  if show_errors and not metadata.is_valid:
@@ -75,8 +114,7 @@ def convert(
75
114
  @click.option("--depositor", type=str)
76
115
  @click.option("--email", type=str)
77
116
  @click.option("--registrant", type=str)
78
- @click.option("--filename", type=str)
79
- @click.option("--jsonlines/--no-jsonlines", type=bool, show_default=True, default=False)
117
+ @click.option("--file", type=str)
80
118
  @click.option("--show-errors/--no-errors", type=bool, show_default=True, default=False)
81
119
  @click.option("--show-timer/--no-timer", type=bool, show_default=True, default=False)
82
120
  def list(
@@ -89,8 +127,69 @@ def list(
89
127
  depositor,
90
128
  email,
91
129
  registrant,
92
- filename,
93
- jsonlines,
130
+ file,
131
+ show_errors,
132
+ show_timer,
133
+ ):
134
+ start = time.time()
135
+ metadata_list = MetadataList(
136
+ string,
137
+ via=via,
138
+ file=file,
139
+ depositor=depositor,
140
+ email=email,
141
+ registrant=registrant,
142
+ prefix=prefix,
143
+ )
144
+ end = time.time()
145
+ runtime = end - start
146
+ if show_errors and not metadata_list.is_valid:
147
+ raise click.ClickException(str(metadata_list.errors))
148
+ if file:
149
+ metadata_list.write(to=to, style=style, locale=locale)
150
+ else:
151
+ click.echo(metadata_list.write(to=to, style=style, locale=locale))
152
+
153
+ if show_errors and len(metadata_list.write_errors) > 0:
154
+ raise click.ClickException(str(metadata_list.write_errors))
155
+ if show_timer:
156
+ click.echo(f"Runtime: {runtime:.2f} seconds")
157
+
158
+
159
+ @cli.command()
160
+ @click.argument("string", type=str, required=True)
161
+ @click.option("--via", "-f", type=str)
162
+ @click.option("--to", "-t", type=str, default="commonmeta")
163
+ @click.option("--style", "-s", type=str, default="apa")
164
+ @click.option("--locale", "-l", type=str, default="en-US")
165
+ @click.option("--prefix", type=str)
166
+ @click.option("--depositor", type=str)
167
+ @click.option("--email", type=str)
168
+ @click.option("--registrant", type=str)
169
+ @click.option("--login_id", type=str)
170
+ @click.option("--login_passwd", type=str)
171
+ @click.option("--host", type=str)
172
+ @click.option("--token", type=str)
173
+ @click.option("--legacy-key", type=str)
174
+ @click.option("--file", type=str)
175
+ @click.option("--show-errors/--no-errors", type=bool, show_default=True, default=False)
176
+ @click.option("--show-timer/--no-timer", type=bool, show_default=True, default=False)
177
+ def push(
178
+ string,
179
+ via,
180
+ to,
181
+ style,
182
+ locale,
183
+ prefix,
184
+ depositor,
185
+ email,
186
+ registrant,
187
+ login_id,
188
+ login_passwd,
189
+ host,
190
+ token,
191
+ legacy_key,
192
+ file,
94
193
  show_errors,
95
194
  show_timer,
96
195
  ):
@@ -98,18 +197,22 @@ def list(
98
197
  metadata_list = MetadataList(
99
198
  string,
100
199
  via=via,
200
+ file=file,
101
201
  depositor=depositor,
102
202
  email=email,
103
203
  registrant=registrant,
204
+ login_id=login_id,
205
+ login_passwd=login_passwd,
206
+ host=host,
207
+ token=token,
104
208
  prefix=prefix,
105
- filename=filename,
106
- jsonlines=jsonlines,
107
209
  )
108
210
  end = time.time()
109
211
  runtime = end - start
110
212
  if show_errors and not metadata_list.is_valid:
111
213
  raise click.ClickException(str(metadata_list.errors))
112
- click.echo(metadata_list.write(to=to, style=style, locale=locale))
214
+
215
+ click.echo(metadata_list.push(to=to, style=style, locale=locale))
113
216
  if show_errors and len(metadata_list.write_errors) > 0:
114
217
  raise click.ClickException(str(metadata_list.write_errors))
115
218
  if show_timer:
@@ -167,29 +270,6 @@ def decode(doi):
167
270
  click.echo(output)
168
271
 
169
272
 
170
- @cli.command()
171
- @click.argument("id", type=str, required=True)
172
- def encode_by_id(id):
173
- post = get_json_feed_item_uuid(id)
174
- prefix = py_.get(post, "blog.prefix")
175
- if validate_prefix(prefix) is None:
176
- return None
177
- output = encode_doi(prefix)
178
- click.echo(output)
179
-
180
-
181
- @cli.command()
182
- @click.argument("filter", type=str, required=True, default="unregistered")
183
- @click.option("--id", type=str)
184
- def json_feed(filter, id=None):
185
- if filter == "blog_slug" and id is not None:
186
- post = get_json_feed_item_uuid(id)
187
- output = py_.get(post, "blog.slug", "no slug found")
188
- else:
189
- output = "no filter specified"
190
- click.echo(output)
191
-
192
-
193
273
  @cli.command()
194
274
  @click.argument("id", type=str, required=True)
195
275
  @click.option("--api-key", "-k", type=str, required=True)
commonmeta/constants.py CHANGED
@@ -190,6 +190,7 @@ CM_TO_CR_TRANSLATIONS = {
190
190
  "JournalIssue": "JournalIssue",
191
191
  "JournalVolume": "JournalVolume",
192
192
  "Journal": "Journal",
193
+ "PeerReview": "PeerReview",
193
194
  "ProceedingsArticle": "ProceedingsArticle",
194
195
  "ProceedingsSeries": "ProceedingsSeries",
195
196
  "Proceedings": "Proceedings",
@@ -698,3 +699,22 @@ ROR_TO_CROSSREF_FUNDER_ID_TRANSLATIONS = {
698
699
  "https://ror.org/00yjd3n13": "https://doi.org/10.13039/501100001711",
699
700
  "https://ror.org/04wfr2810": "https://doi.org/10.13039/501100003043",
700
701
  }
702
+
703
+ COMMUNITY_TRANSLATIONS = {
704
+ "ai": "artificialintelligence",
705
+ "llms": "artificialintelligence",
706
+ "book%20review": "bookreview",
707
+ "bjps%20review%20of%20books": "bookreview",
708
+ "books": "bookreview",
709
+ "nachrichten": "news",
710
+ "opencitations": "researchassessment",
711
+ "papers": "researchblogging",
712
+ "urheberrecht": "copyright",
713
+ "workshop": "events",
714
+ "veranstaltungen": "events",
715
+ "veranstaltungshinweise": "events",
716
+ "asapbio": "preprints",
717
+ "biorxiv": "preprints",
718
+ "runiverse": "r",
719
+ "bericht": "report",
720
+ }
@@ -0,0 +1,112 @@
1
+ """File utils module for commonmeta-py"""
2
+
3
+ import gzip
4
+ import io
5
+ import zipfile
6
+ from pathlib import Path
7
+ from typing import Optional, Union
8
+
9
+ import requests
10
+
11
+
12
+ def read_file(filename: str) -> bytes:
13
+ with open(filename, "rb") as f:
14
+ return f.read()
15
+
16
+
17
+ def uncompress_content(input: bytes) -> bytes:
18
+ with gzip.GzipFile(fileobj=io.BytesIO(input)) as gz:
19
+ return gz.read()
20
+
21
+
22
+ def unzip_content(input: bytes, filename: Optional[str] = None) -> bytes:
23
+ output = b""
24
+ with zipfile.ZipFile(io.BytesIO(input)) as zf:
25
+ for info in zf.infolist():
26
+ if filename and info.filename != filename:
27
+ continue
28
+ with zf.open(info) as file:
29
+ output += file.read()
30
+ return output
31
+
32
+
33
+ def read_gz_file(filename: str) -> bytes:
34
+ input_bytes = read_file(filename)
35
+ return uncompress_content(input_bytes)
36
+
37
+
38
+ def read_zip_file(filename: str, name: Optional[str] = None) -> bytes:
39
+ input_bytes = read_file(filename)
40
+ return unzip_content(input_bytes, name)
41
+
42
+
43
+ def download_file(url: str) -> bytes:
44
+ resp = requests.get(url, stream=True)
45
+ resp.raise_for_status()
46
+ return resp.content
47
+ # # Progress bar
48
+ # total = int(resp.headers.get("content-length", 0))
49
+
50
+ # buf = io.BytesIO()
51
+ # with tqdm(total=total, unit="B", unit_scale=True, desc="downloading") as bar:
52
+ # for chunk in resp.iter_content(chunk_size=8192):
53
+ # if chunk:
54
+ # buf.write(chunk)
55
+ # bar.update(len(chunk))
56
+ # return buf.getvalue()
57
+
58
+
59
+ def write_file(filename: str, output: bytes) -> None:
60
+ with open(filename, "xb") as f:
61
+ f.write(output)
62
+
63
+
64
+ def write_gz_file(filename: str, output: bytes) -> None:
65
+ with gzip.open(filename, "xb") as gzfile:
66
+ gzfile.write(output)
67
+
68
+
69
+ def write_zip_file(filename: str, output: bytes) -> None:
70
+ path = Path(filename)
71
+ with zipfile.ZipFile(filename, "w", zipfile.ZIP_DEFLATED) as zipf:
72
+ zipf.writestr(path.name, output)
73
+
74
+
75
+ def get_extension(filename: str) -> tuple[str, str, Optional[str]]:
76
+ """Extract extension and compression from filename"""
77
+ extension = Path(filename).suffix
78
+ if extension == ".gz":
79
+ compress = ".gz"
80
+ filename = filename[:-3]
81
+ extension = Path(filename).suffix
82
+ elif extension == ".zip":
83
+ compress = ".zip"
84
+ filename = filename[:-4]
85
+ extension = Path(filename).suffix
86
+ elif extension == "":
87
+ compress = None
88
+ filename = filename + ".json"
89
+ extension = ".json"
90
+ else:
91
+ compress = None
92
+ return filename, extension, compress
93
+
94
+
95
+ def write_output(filename: str, input: Union[bytes, str], ext: list[str]) -> None:
96
+ """Write output to file with supported extension"""
97
+
98
+ # Convert string to bytes if necessary
99
+ if isinstance(input, str):
100
+ input = input.encode("utf-8")
101
+
102
+ filename, extension, compress = get_extension(filename)
103
+ if extension not in ext:
104
+ raise ValueError(
105
+ f"File format not supported. Please provide a filename with {ext} extension."
106
+ )
107
+ if compress == ".gz":
108
+ write_gz_file(filename + compress, input)
109
+ elif compress == ".zip":
110
+ write_zip_file(filename + compress, input)
111
+ else:
112
+ write_file(filename, input)