commonmeta-ruby 3.3.18 → 3.4.1

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +33 -29
  3. data/lib/commonmeta/crossref_utils.rb +22 -0
  4. data/lib/commonmeta/readers/json_feed_reader.rb +19 -1
  5. data/lib/commonmeta/schema_utils.rb +1 -1
  6. data/lib/commonmeta/version.rb +1 -1
  7. data/resources/{commonmeta_v0.9.2.json → commonmeta_v0.9.3.json} +32 -2
  8. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/blog_post_with_non-url_id.yml +84 -18
  9. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/blogger_post.yml +42 -14
  10. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_author_name_suffix.yml +184 -55
  11. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_doi.yml +76 -15
  12. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_institutional_author.yml +33 -12
  13. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_organizational_author.yml +44 -11
  14. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_related_identifiers.yml +366 -0
  15. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_without_doi.yml +144 -11
  16. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post.yml +42 -13
  17. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post_with_anonymous_author.yml +17 -13
  18. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/substack_post_with_broken_reference.yml +557 -262
  19. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/syldavia_gazette_post_with_references.yml +76 -47
  20. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/upstream_post_with_references.yml +303 -123
  21. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post.yml +108 -12
  22. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post_with_many_references.yml +3048 -441
  23. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post_with_references.yml +178 -31
  24. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post_with_tracking_code_on_url.yml +139 -17
  25. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/another_schema_org_from_front-matter.yml +47 -48
  26. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/journal_article.yml +5 -5
  27. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/journal_article_from_datacite.yml +7 -7
  28. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_anonymous_author.yml +17 -13
  29. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_doi.yml +108 -12
  30. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_organizational_author.yml +44 -11
  31. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_relations.yml +366 -0
  32. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_upstream_blog.yml +200 -11
  33. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_with_references.yml +303 -123
  34. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/posted_content.yml +16 -16
  35. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/schema_org_from_another_science_blog.yml +17 -17
  36. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/schema_org_from_front_matter.yml +111 -113
  37. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/schema_org_from_upstream_blog.yml +64 -57
  38. data/spec/readers/json_feed_reader_spec.rb +85 -57
  39. data/spec/writers/crossref_xml_writer_spec.rb +76 -40
  40. metadata +6 -4
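
Taken together, the changes above (new code in crossref_utils.rb and json_feed_reader.rb, the commonmeta schema bump from v0.9.2 to v0.9.3, and the new *_with_related_identifiers / *_with_relations cassettes) suggest this release adds reading of a JSON Feed item's "relationships" array and writing it out as Crossref XML. The following is a minimal usage sketch, not taken from the diff: the Commonmeta::Metadata class name comes from the cassette paths, while the input: keyword and the related_identifiers / crossref_xml accessors are assumptions inferred from the spec file names, not verified against the 3.4.1 API.

    # Hypothetical sketch; accessor names are assumed, not verified against 3.4.1.
    require "commonmeta"

    # The post recorded in the first new cassette below.
    subject = Commonmeta::Metadata.new(
      input: "https://rogue-scholar.org/api/posts/8a4de443-3347-4b82-b57d-e3c82b6485fc"
    )

    # 3.4.x reads the feed's "relationships" array (e.g. the IsIdenticalTo link
    # to https://doi.org/10.5438/bc11-cqw1) alongside "reference".
    puts subject.related_identifiers.inspect  # assumed accessor

    # ...and maps it to the Crossref XML output exercised by crossref_xml_writer_spec.rb.
    puts subject.crossref_xml                 # assumed accessor
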
@@ -0,0 +1,366 @@
+ ---
+ http_interactions:
+ - request:
+ method: get
+ uri: https://rogue-scholar.org/api/posts/8a4de443-3347-4b82-b57d-e3c82b6485fc
+ body:
+ encoding: UTF-8
+ string: ''
+ headers:
+ Connection:
+ - close
+ Host:
+ - rogue-scholar.org
+ User-Agent:
+ - http.rb/5.1.1
+ response:
+ status:
+ code: 200
+ message: OK
+ headers:
+ Age:
+ - '0'
+ Cache-Control:
+ - public, max-age=0, must-revalidate
+ Content-Length:
+ - '9265'
+ Content-Type:
+ - application/json; charset=utf-8
+ Date:
+ - Wed, 06 Sep 2023 15:24:18 GMT
+ Etag:
+ - '"1380dhtw0b375b"'
+ Server:
+ - Vercel
+ Strict-Transport-Security:
+ - max-age=63072000
+ X-Matched-Path:
+ - "/api/posts/[[...params]]"
+ X-Vercel-Cache:
+ - MISS
+ X-Vercel-Id:
+ - fra1::iad1::4j8g8-1694013857715-6c6b9eece4e8
+ Connection:
+ - close
+ body:
+ encoding: UTF-8
+ string: '{"id":"8a4de443-3347-4b82-b57d-e3c82b6485fc","doi":"https://doi.org/10.53731/r79v4e1-97aq74v-ag578","url":"https://blog.front-matter.io/posts/differences-between-orcid-and-datacite-metadata","title":"Differences
+ between ORCID and DataCite Metadata","summary":"One of the first tasks for
+ DataCite in the European Commission-funded THOR project, which started in
+ June, was to contribute to a comparison of the ORCID and DataCite metadata
+ standards.","content_html":" <p><img src=\"https://blog.front-matter.io/content/images/2023/09/cat_and_dog-1.png\"
+ /></p><p>One of the first tasks for DataCite in the European Commission-funded
+ <a href=\"http://project-thor.eu/\">THOR project</a>, which started in June,
+ was to contribute to a comparison of the ORCID and DataCite metadata standards.
+ Together with ORCID, CERN, the British Library and Dryad we looked at how
+ contributors, organizations and artefacts - and the relations between them
+ - are described in the respective metadata schemata, and how they are implemented
+ in two example data repositories, <a href=\"http://archaeologydataservice.ac.uk/\">Archaeology
+ Data Service</a> and <a href=\"https://www.datadryad.org/\">Dryad Digital
+ Repository</a>.</p>\n<p>The focus of our work was on identifying major gaps.
+ Our report was finished and made publicly available last week (Fenner et al.,
+ <a href=\"https://blog.datacite.org/differences-between-orcid-and-datacite-metadata/#ref-https://doi.org/10.5281/ZENODO.30799\">2015</a>).
+ The key findings are summarized below:</p>\n<ul><li>Common Approach to Personal
+ Names</li><li>Standardized Contributor Roles</li><li>Standardized Relation
+ Types</li><li>Metadata for Organisations</li><li>Persistent Identifiers for
+ Projects</li><li>Harmonization of ORCID and DataCite Metadata</li></ul>\n<h3>Common
+ Approach to Personal Names</h3>\n<p>While a single input field for contributor
+ names is common, separate fields for given and family names are required for
+ <a href=\"http://docs.citationstyles.org/en/stable/specification.html#names\">proper
+ formatting of citations</a>. As long as citations to scholarly content rely
+ on properly formatted text rather than persistent identifiers, services holding
+ bibliographic information have to support these separate fields. Further work
+ is needed to help with the transition to separate input fields for given and
+ famliy names, and to handle contributors that are organizations or groups
+ of people.</p>\n<h3>Standardized Contributor Roles</h3>\n<p>The currently
+ existing vocabularies for <strong>contributor type</strong> (DataCite) and
+ <strong>contributor role</strong> (ORCID) provide a high-level description,
+ but fall short when trying to describe the author/creator contribution in
+ more detail. <a href=\"http://docs.casrai.org/CRediT\">Project CRediT</a>
+ is a multi-stakeholder initiative that has developed a common vocabulary with
+ 14 different contributor roles, and this vocabulary can be used to provide
+ this detail, e.g. who provided resources such as reagents or samples, who
+ did the statistical analysis, or who contributed to the methodology of a study.</p>\n<p>CRediT
+ is complementary to existing contributor role vocabularies such as those by
+ ORCID and DataCite. For contributor roles it is particularly important that
+ the same vocabulary is used across stakeholders, so that the roles described
+ in the data center can be forwarded first to DataCite, then to ORCID, and
+ then also to other places such as institutional repositories.</p>\n<h3>Standardized
+ Relation Types</h3>\n<p>Capturing relations between scholarly works such as
+ datasets in a standardized way is important, as these relations are used for
+ citations and thus the basis for many indicators of scholarly impact. Currently
+ used vocabularies for relation types between scholarly works, e.g. by CrossRef
+ and DataCite, only partly overlap. In addition we see differences in community
+ practices, e.g. some scholars but not others reserve the term citation for
+ links between two scholarly articles. The term data citation is sometimes
+ used for all links from scholarly works to datasets, but other times reserved
+ for formal citations appearing in reference lists.</p>\n<h3>Metadata for Organisations</h3>\n<p>Both
+ ORCID and DataCite not only provide persistent identifiers for people and
+ data, but they also collect metadata around these persistent identifiers,
+ in particular links to other identifiers. The use of persistent identifiers
+ for organizations lags behind the use of persistent identifiers for research
+ outputs and people, and more work is needed.</p>\n<h3>Persistent Identifiers
+ for Projects</h3>\n<p>Research projects are collaborative activities among
+ contributors that may change over time. Projects have a start and end date
+ and are often funded by a grant. The existing persistent identifier (PID)
+ infrastructure does support artefacts, contributors and organisations, but
+ there is no first-class PID support for projects. This creates a major gap
+ that becomes obvious when we try to describe the relationships between funders,
+ contributors and research outputs.</p>\n<p>Both the ORCID and DataCite metadata
+ support funding information, but only as direct links to contributors or research
+ outputs, respectively. This not only makes it difficult to exchange funding
+ information between DataCite and ORCID, but also fails to adequately model
+ the sometimes complex relationships, e.g. when multiple funders and grants
+ were involved in supporting a research output. We therefore not only need
+ persistent identifiers for projects, but also infrastructure for collecting
+ and aggregating links to contributors and artefacts.</p>\n<h3>Harmonization
+ of ORCID and DataCite Metadata</h3>\n<p>We identified significant differences
+ between the ORCID and DataCite metadata schema, and these differences hinder
+ the flow of information between the two services. Several different approaches
+ to overcome these differences are conceivable:</p>\n<ol><li>only use a common
+ subset, relying on linked persistent identifiers to get the full metadata</li><li>harmonize
+ the ORCID and DataCite metadata schemata</li><li>common API exchange formats
+ for metadata</li></ol>\n<p>The first approach is the linked open data approach,
+ and was designed specifically for scenarios like this. One limitation is that
+ it requires persistent identifiers for all relevant attributes (e.g. for every
+ creator/contributor in the DataCite metadata). One major objective for THOR
+ is therefore to increase the use of persistent identifiers, both by THOR partners,
+ and by the community at large.</p>\n<p>A common metadata schema between ORCID
+ and DataCite is neither feasible nor necessarily needed. In addition, we have
+ to also consider interoperability with other metadata standards (e.g. CASRAI,
+ OpenAIRE, COAR), and with other artifacts, such as those having CrossRef DOIs.
+ What is more realistic is harmonization across a limited set essential metadata.</p>\n<p>The
+ third approach to improve interoperability uses a common API format that includes
+ all the metadata that need to be exchanged, but doesn’t require the metadata
+ schema itself to change. This approach was <a href=\"https://www.crossref.org/blog/crossref-and-datacite-unify-support-for-http-content-negotiation/\">taken
+ by DataCite and CrossRef a few years ago</a> to provide metadata for DOIs
+ in a consistent way despite significant differences in the CrossRef and DataCite
+ metadata schema. Using HTTP content negotiation, metadata are provided in
+ a variety of formats.</p>\n<h2>References</h2>\n<p>Fenner M, Demeranville
+ T, Kotarski R, et al. <em>D2.1: Artefact, Contributor, And Organisation Relationship
+ Data Schema</em>. Zenodo; 2015. doi:<a href=\"https://doi.org/10.5281/ZENODO.30799\">10.5281/ZENODO.30799</a></p>\n<h2>Notes</h2>\n<p>This
+ blog post was <a href=\"https://doi.org/10.5438/bc11-cqw1\">originally published</a>
+ on the DataCite Blog.</p> ","published_at":1442534400,"updated_at":1693949721,"indexed_at":1693950834,"authors":[{"url":"https://orcid.org/0000-0003-1419-2405","name":"Martin
+ Fenner"}],"image":"https://blog.front-matter.io/content/images/2023/09/cat_and_dog-1.png","tags":["Feature"],"language":"en","reference":[{"doi":"https://doi.org/10.5281/ZENODO.30799","key":"ref1"}],"relationships":[{"url":"https://doi.org/10.5438/bc11-cqw1","type":"IsIdenticalTo"}],"blog_id":"f0m0e38","blog_name":"Front
+ Matter","blog_slug":"front_matter","blog":{"id":"f0m0e38","title":"Front Matter","description":"The
+ Front Matter Blog covers the intersection of science and technology since
+ 2007.","language":"en","favicon":"https://blog.front-matter.io/favicon.png","feed_url":"https://blog.front-matter.io/atom-complete/","home_page_url":"https://blog.front-matter.io","user_id":"8498eaf6-8c58-4b58-bc15-27eda292b1aa","created_at":"2023-01-02","feed_format":"application/atom+xml","license":"https://creativecommons.org/licenses/by/4.0/legalcode","generator":"Ghost
+ 5.52","category":"computerAndInformationSciences","prefix":"10.53731","modified_at":"2023-09-05T21:35:21+00:00","version":"https://jsonfeed.org/version/1.1","current_feed_url":"https://blog.front-matter.io/atom/","status":"active","issn":"2749-9952","backlog":0,"authors":null,"plan":"Team","slug":"front_matter","use_mastodon":true}}'
+ recorded_at: Wed, 06 Sep 2023 15:24:18 GMT
+ - request:
+ method: get
+ uri: https://doi.org/10.5281/ZENODO.30799
+ body:
+ encoding: UTF-8
+ string: ''
+ headers:
+ Accept:
+ - application/vnd.citationstyles.csl+json
+ Connection:
+ - close
+ Host:
+ - doi.org
+ User-Agent:
+ - http.rb/5.1.1
+ response:
+ status:
+ code: 302
+ message: Found
+ headers:
+ Date:
+ - Wed, 06 Sep 2023 15:24:18 GMT
+ Content-Type:
+ - text/html;charset=utf-8
+ Content-Length:
+ - '183'
+ Connection:
+ - close
+ Location:
+ - https://data.crosscite.org/10.5281%2FZENODO.30799
+ Vary:
+ - Accept
+ Expires:
+ - Wed, 06 Sep 2023 15:30:19 GMT
+ Permissions-Policy:
+ - interest-cohort=(),browsing-topics=()
+ Cf-Cache-Status:
+ - DYNAMIC
+ Report-To:
+ - '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v3?s=ywuzQP8eruRViS7nR5D41YCRcKWTPwZHjxfQqcZmapzy6n67XHuN9%2FWJ4lFZrQ2WVIObmrL0noWrAFwYWf3TwVrW41J8YL6I3XOcyDg3B7dOcY18ckRh%2Ba4%3D"}],"group":"cf-nel","max_age":604800}'
+ Nel:
+ - '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}'
+ Strict-Transport-Security:
+ - max-age=31536000; includeSubDomains; preload
+ Server:
+ - cloudflare
+ Cf-Ray:
+ - 8027b7d8dcf3bbb5-FRA
+ Alt-Svc:
+ - h3=":443"; ma=86400
+ body:
+ encoding: UTF-8
+ string: |-
+ <html><head><title>Handle Redirect</title></head>
+ <body><a href="https://data.crosscite.org/10.5281%2FZENODO.30799">https://data.crosscite.org/10.5281%2FZENODO.30799</a></body></html>
+ recorded_at: Wed, 06 Sep 2023 15:24:18 GMT
+ - request:
+ method: get
+ uri: https://data.crosscite.org/10.5281%2FZENODO.30799
+ body:
+ encoding: UTF-8
+ string: ''
+ headers:
+ Accept:
+ - application/vnd.citationstyles.csl+json
+ Connection:
+ - close
+ User-Agent:
+ - http.rb/5.1.1
+ Host:
+ - data.crosscite.org
+ response:
+ status:
+ code: 200
+ message: OK
+ headers:
+ Date:
+ - Wed, 06 Sep 2023 15:24:19 GMT
+ Content-Type:
+ - application/vnd.citationstyles.csl+json; charset=utf-8
+ Transfer-Encoding:
+ - chunked
+ Connection:
+ - close
+ Status:
+ - 200 OK
+ Cache-Control:
+ - max-age=0, private, must-revalidate
+ Vary:
+ - Accept-Encoding, Origin
+ Etag:
+ - W/"808492baef8e3e4af0fa6812bc25a08c"
+ X-Runtime:
+ - '0.018492'
+ X-Request-Id:
+ - 38c1416b-6a7a-4f67-8ff5-5ecadad0191b
+ X-Powered-By:
+ - Phusion Passenger(R) 6.0.13
+ Server:
+ - nginx/1.18.0 + Phusion Passenger(R) 6.0.13
+ body:
+ encoding: UTF-8
+ string: |-
+ {
+ "type": "report",
+ "id": "https://doi.org/10.5281/zenodo.30799",
+ "categories": [
+ "persistent identifier",
+ "pid",
+ "orcid",
+ "doi",
+ "datacite"
+ ],
+ "author": [
+ {
+ "family": "Fenner",
+ "given": "Martin"
+ },
+ {
+ "family": "Demeranville",
+ "given": "Tom"
+ },
+ {
+ "family": "Kotarski",
+ "given": "Rachael"
+ },
+ {
+ "family": "Vision",
+ "given": "Todd"
+ },
+ {
+ "family": "Rueda",
+ "given": "Laura"
+ },
+ {
+ "family": "Dasler",
+ "given": "Robin"
+ },
+ {
+ "family": "Haak",
+ "given": "Laure"
+ },
+ {
+ "family": "Cruse",
+ "given": "Patricia"
+ }
+ ],
+ "issued": {
+ "date-parts": [
+ [
+ 2015,
+ 9,
+ 11
+ ]
+ ]
+ },
+ "abstract": "This document identifies gaps in existing PID infrastructures, with a focus on ORCID and DataCite Metadata and links between contributors, organizations and artefacts. What prevents us from establishing interoperability and overcoming barriers between PID platforms for contributors, artefacts and organisations, and research solutions for federated attribution, claiming, publishing and direct data access? It goes on to propose strategies to overcome these gaps.",
+ "DOI": "10.5281/ZENODO.30799",
+ "publisher": "Zenodo",
+ "title": "D2.1: Artefact, Contributor, And Organisation Relationship Data Schema",
+ "URL": "https://zenodo.org/record/30799",
+ "copyright": "Creative Commons Attribution 4.0"
+ }
+ recorded_at: Wed, 06 Sep 2023 15:24:19 GMT
+ - request:
+ method: head
+ uri: https://doi.org/10.5438/bc11-cqw1
+ body:
+ encoding: UTF-8
+ string: ''
+ headers:
+ Connection:
+ - close
+ Host:
+ - doi.org
+ User-Agent:
+ - http.rb/5.1.1
+ response:
+ status:
+ code: 302
+ message: Found
+ headers:
+ Date:
+ - Wed, 06 Sep 2023 15:24:20 GMT
+ Content-Type:
+ - text/html;charset=utf-8
+ Content-Length:
+ - '233'
+ Connection:
+ - close
+ Location:
+ - https://datacite.org/blog/differences-between-orcid-and-datacite-metadata/
+ Vary:
+ - Accept
+ Expires:
+ - Wed, 06 Sep 2023 16:22:17 GMT
+ Permissions-Policy:
+ - interest-cohort=(),browsing-topics=()
+ Cf-Cache-Status:
+ - DYNAMIC
+ Report-To:
+ - '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v3?s=%2FxKaKe0Uqd0w7P2s8Rg2q0OaJky3ejizEc%2BnC3S5rLrMjs4VmA56z2ZgoixlwuaCGwE7FO2c2NtawI2T%2FH5j7ysYCJ%2BSe4k2V56KAA0oie0B0YP8OUYGtYg%3D"}],"group":"cf-nel","max_age":604800}'
+ Nel:
+ - '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}'
+ Strict-Transport-Security:
+ - max-age=31536000; includeSubDomains; preload
+ Server:
+ - cloudflare
+ Cf-Ray:
+ - 8027b7e05a735bf5-FRA
+ Alt-Svc:
+ - h3=":443"; ma=86400
+ body:
+ encoding: UTF-8
+ string: ''
+ recorded_at: Wed, 06 Sep 2023 15:24:20 GMT
+ recorded_with: VCR 6.2.0
@@ -23,13 +23,13 @@ http_interactions:
  Cache-Control:
  - public, max-age=0, must-revalidate
  Content-Length:
- - '1836'
+ - '17466'
  Content-Type:
  - application/json; charset=utf-8
  Date:
- - Tue, 11 Jul 2023 06:03:27 GMT
+ - Wed, 06 Sep 2023 14:50:20 GMT
  Etag:
- - '"xxryc79e8o1f0"'
+ - '"v3f9t3mndwdfq"'
  Server:
  - Vercel
  Strict-Transport-Security:
@@ -39,7 +39,7 @@ http_interactions:
  X-Vercel-Cache:
  - MISS
  X-Vercel-Id:
- - fra1::iad1::w29f8-1689055406778-6dd79d460a8c
+ - fra1::iad1::jghz2-1694011819953-8831a8905b67
  Connection:
  - close
  body:
@@ -49,12 +49,201 @@ http_interactions:
  subject classification was done manually at varying levels of granularity,
  depending on the use case for the institution. Subject classification is done
  to help collate resources by subject enabling the user to discover publications
- based on different levels of subject specificity. It can also be used to help
- determine where to publish and the direction a particular author may be pursuing
- in their research if one wants to track where their work is being published....","published_at":1684834305,"updated_at":1687862553,"indexed_at":1688982864,"authors":[{"url":"https://orcid.org/0000-0001-9165-2757","name":"Esha
+ based on different levels of subject specificity.","content_html":" <p><img
+ src=\"https://upstream.force11.org/content/images/2023/05/esha-subject-blog.jpg\"
+ /></p><p>Traditionally, journal subject classification was done manually at
+ varying levels of granularity, depending on the use case for the institution.
+ Subject classification is done to help collate resources by subject enabling
+ the user to discover publications based on different levels of subject specificity.
+ It can also be used to help determine where to publish and the direction a
+ particular author may be pursuing in their research if one wants to track
+ where their work is being published. Currently, most subject classification
+ is done manually as it is a speciality that requires a lot of training. However,
+ this effort can be siloed by institution or can be hampered by various inter-institutional
+ agreements that prevent other resources from being classified. It could also
+ prevent a standardized approach to classifying items if different publications
+ in separate institutions use different taxonomies and classification systems.
+ Automating classification work surfaces questions about the relevance of the
+ taxonomy used, the potential bias that might exist, and the texts being classified.
+ Currently, journals are classified using various taxonomies and are siloed
+ in many systems, such as library databases or software for publishers. Providing
+ a service that can automatically classify a text (and provide a measure of
+ accuracy!) outside of a specific system can democratize access to this information
+ across all systems. Crossref infrastructure enables a range of services for
+ the research community; we have a wealth of metadata created by a very large
+ global community. We wondered how we could contribute in this area.</p><p>In
+ our own metadata corpus, we had subject classifications for a subset of our
+ journals provided by Elsevier. However, this meant that we were providing
+ subject information unevenly across our metadata. We wondered if we could
+ extrapolate the information and provide the data across all our metadata.</p><p>We
+ looked specifically at journal-level classification instead of article-level
+ classification for a few reasons. We had the training data for journal-level
+ subject classification; it was a good place to begin understanding what would
+ be needed. Our work so far provides a foundation for further article-level
+ classification - if Crossref decides to investigate further.</p><p>To start
+ with, I used Elsevier’s All Science Journal Classification Codes (<a href=\"https://service.elsevier.com/app/answers/detail/a_id/15181/supporthub/scopus/\">ASJC</a>),
+ which have been applied to their <a href=\"https://www.elsevier.com/solutions/scopus/how-scopus-works/content\">database</a>
+ of publications, which includes journals and books. We used ASJC because it
+ contained metadata that could be parsed programmatically. If the project progressed
+ well, we felt that we could look at other classification systems.</p><p>After
+ pre-processing, three methods (tf-idf, Embeddings, LLM) were used, and their
+ performances were benchmarked. The following outlines the steps taken for
+ the pre-processing, cleaning, and implementation details of the methods used
+ to predict the subject classification of journals.</p><h3>Pre-processing of
+ data</h3><p>The Excel document was processed as a CSV file and has various
+ information, including journal titles, the corresponding print and e- ISSNs,
+ and their ASJC codes. The journals were mostly in English but were also in
+ many other languages, such as Russian, Italian, Spanish, Chinese, and others.
+ First, there was a process to see which journals in the Elsevier list also
+ existed in the Crossref corpus. As of June 2022, there were 26,000 journals
+ covered by the Elsevier database. The journals could contain one or many subject
+ categories. For example, the <em>Journal of Children’s Services</em> has several
+ subjects assigned to them, such as Law, Sociology and Political Science, Education,
+ and Health. The journal titles have some data, but not a lot. They averaged
+ about four words per title, so more data was needed. First, 10 - 20 journal
+ article titles per journal were added if there were that many journal articles
+ available. At Crossref, a few journal articles contain abstracts, but not
+ all. So, for the moment, journal titles and their corresponding article titles
+ were the additional data points that were used.</p><h5><strong>Cleaning the
+ data</strong></h5><p>The data was cleaned up to remove stop words, various
+ types of formulae, and XML from the titles. Stop words generally consist of
+ articles, pronouns, conjunctions, and other frequently used words. The <a
+ href=\"https://github.com/stopwords-iso/stopwords-iso\">stop words list</a>
+ of all languages in the ISO-639 standard was used to process the titles. Some
+ domain-specific terms to the stop words, such as “journal”, “archive”, “book”,
+ “studies”, and so on, were also added to the list. Formulae and XML tags were
+ removed with regular expressions. Rare subject categories that were assigned
+ to very few journals (less than 50 out of 26000 journals)  were also removed.
+ The cleaned data was now ready for processing. It was split into training,
+ validation, and test sets.</p><h3>Methods</h3><p>This particular type of classification
+ is known as a multi-label classification problem since zero, or many subjects
+ can be assigned to a journal. Three methods were used to see which performed
+ best.</p><h4><strong>TF-IDF + Linear Support Vector Classification</strong></h4><p>The
+ first approach used the tf-idf and multilabel binarizer libraries from <a
+ href=\"https://scikit-learn.org/stable/index.html\">scikit learn</a>. <a href=\"https://en.wikipedia.org/wiki/Tf%E2%80%93idf\">Tf-idf</a>
+ is a numerical statistic that is intended to reflect how important a word
+ is to a document in a collection. Using tf-idf, a  number of different strategies
+ that can be used within a multi-label classification problem were benchmarked.
+ The tf-idf vectorizer and multilabel binarizer are Python libraries that convert
+ data into machine parseable vectors. Essentially, the data is a table of journal
+ and article titles and their corresponding subjects.</p><p>A baseline prediction
+ was needed to benchmark the performance of the strategies used. This prediction
+ was made by comparing the presence of the subject codes assigned to the journal
+ with the most common subject codes present in the corpus. The measure used
+ to compare the performances was the micro <a href=\"https://en.wikipedia.org/wiki/F-score\">F1</a>
+ score. The micro F1 score of the baseline prediction was 0.067. It shows that
+ applying a naive approach will provide a prediction at 6.67% accuracy. That
+ measure provided a good starting point to get an idea of the performance of
+ subsequent methods.</p><p>Among the strategies used, the best-performing strategy
+ was One vs Rest using LinearSVC. The micro F1 score was 0.43 after processing
+ 20,000 features using the validation dataset. This was a decent increase from
+ the baseline; however, it is still not very serviceable. In order to improve
+ performance, it was decided to reduce the granularity of subjects. For example,
+ the journal, <em>Journal of Children’s Services,</em> has several subjects
+ assigned to them, such as Law, Sociology and Political Science'', Education,
+ and Health. Elsevier’s ASJC subjects are in hierarchies. There are several
+ subgroups of fields within some overarching fields. For example, the group,
+ Medicine, has several specialities of medicine listed under it. The subjects,
+ Social Sciences and Psychology work similarly. They are two separate fields
+ of study, and the journal has articles that apply to either or both fields
+ of study. The subjects listed in the  <em>Journal of Children’s Services </em>are
+ in two different groups: Social Sciences and Psychology. Downgrading the granularity
+ makes the learning process a little simpler. So, instead of the  <em>Journal
+ of Children’s Services </em>belonging to several different subjects, the journal
+ now belonged to two subjects. Using the same strategy, one vs rest with LinearSVC,
+ we get an F1 score of 0.72 for the same number of titles. This was a marked
+ improvement from before. There were other avenues that could be looked at,
+ such as bringing in more data in the form of references, but there were also
+ other methods to look at. We were curious about the role of embeddings and
+ decided to pursue that approach.</p><h4><strong>Embeddings + Linear Support
+ Vector Classification</strong></h4><p>This approach is slightly different
+ from the tf-idf approach. For the titles, we decided to use a model that was
+ already trained on a scientific corpus. For this, AllenAI’s <a href=\"https://github.com/allenai/scibert\">SciBERT</a>
+ was used, a fine-tuned <a href=\"https://arxiv.org/abs/1810.04805\">BERT</a>
+ model trained on papers from the corpus of <a href=\"https://semanticscholar.org\">semanticscholar.org</a>;
+ a tool provided by AllenAI. The model provides an embedding: a vector representation
+ of the titles, based on the data it has already been trained on. This allows
+ it to provide more semantic weight on the data rather than simple occurrence
+ of the words in the document (this occurs with the previous method, tf-idf).
+ The generation of the embedding took over 18 hours on a laptop, but after
+ that, generating predictions became quite fast. The amount of data needed
+ to generate this vector is also lower than the tf-idf generation. The subjects
+ were processed similarly to before and generated a vector using the multilabel
+ binarizer. With 512 features from the titles (instead of 20,000) in the previous
+ approach, the same strategy was used as earlier. Using the one vs rest strategy
+ with LinearSVC the strategy was run against the validation set and got a F1
+ score of 0.71. </p><p>So far, the tally is:</p><table>\n<thead>\n<tr>\n<th>Method</th>\n<th>F1
+ Score</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>Tf-idf + multilabel binarizer</td>\n<td>0.73</td>\n</tr>\n<tr>\n<td>SciBERT
+ embedding + multilabel binarizer</td>\n<td>0.71</td>\n</tr>\n</tbody>\n</table>\n<p>At
+ this point, we were going to look into gathering more data points such as
+ references and run a comparison between these two methods. However, large
+ language models, especially ChatGPT, came into the zeitgeist, a few weeks
+ into mulling over other options.</p><h4><strong>OpenAI: LLM + sentence completion</strong></h4><p>Out
+ of curiosity, the author looked to see what chatGPT could do. ChatGPT was
+ asked to figure out what topics an existing journal title belonged to, and
+ it came very close to predicting the correct answer. The author also asked
+ it to figure out to which topic multiple Dutch journal article titles belonged,
+ and it predicted the correct answer again. The author decided to investigate
+ this avenue knowing that if there were good results, open large language models
+ would be used to see if there would be comparable results. The screenshot
+ below shows the examples listed above.</p><figure><img src=\"https://upstream.force11.org/content/images/2023/08/openai_experiment.png\"
+ loading=\"lazy\" width=\"1600\" height=\"1495\" srcset=\"https://upstream.force11.org/content/images/size/w600/2023/08/openai_experiment.png
+ 600w, https://upstream.force11.org/content/images/size/w1000/2023/08/openai_experiment.png
+ 1000w, https://upstream.force11.org/content/images/2023/08/openai_experiment.png
+ 1600w\" /></figure><p>Subjects had to be processed a little differently for
+ this model. The ASJC codes have subjects in text form as well as numerical
+ values. For example, if there is a journal classified as “Medicine”, it has
+ a code of “27”. The author fine-tuned the openAI model using their “ada” model
+   (it is the fastest and the cheapest) and sent it some sentence completion
+ prompts. Essentially, this means that the model is being fine-tuned into telling
+ it what subject codes it needs to complete the sentences that it is being
+ sent. So, suppose several different titles are sent to the model and asked
+ to complete it with several delimited subject codes. In that case, the model
+ should be able to predict which subject codes should complete the sentences.
+ A set of prompts were created with the journal titles and their corresponding
+ subject codes as the sentence completion prompt to train the model. It looked
+ like this:</p><p><strong><code>{\"prompt\":\"Lower Middle Ordovician carbon
+ and oxygen…..,\"completion\":\" 11\\n19\"}</code></strong></p><p>The above
+ snippet has several different titles where the subjects assigned to these
+ titles are 11 and 19, which are <em>Agricultural and Biological Sciences</em>
+ and<em> Earth and Planetary Sciences,</em> respectively.</p><p>The openAI’s
+ API was used to fine-tune and train a model using the above prompts, and $10.00
+ later, generated a model.</p><figure><img src=\"https://upstream.force11.org/content/images/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png\"
+ loading=\"lazy\" width=\"1600\" height=\"702\" srcset=\"https://upstream.force11.org/content/images/size/w600/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png
+ 600w, https://upstream.force11.org/content/images/size/w1000/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png
+ 1000w, https://upstream.force11.org/content/images/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png
+ 1600w\" /></figure><p>The validation dataset was run against the model and
+ got a micro F1 score of 0.69. So, the tally now is:</p><table>\n<thead>\n<tr>\n<th>Method</th>\n<th>F1
+ Score</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>Tf-idf + multilabel binarizer</td>\n<td>0.73</td>\n</tr>\n<tr>\n<td>SciBERT
+ embedding + multilabel binarizer</td>\n<td>0.71</td>\n</tr>\n<tr>\n<td>ChatGPT
+ + sentence completion</td>\n<td>0.69</td>\n</tr>\n</tbody>\n</table>\n<h3>Summary</h3><p>So,
+ sad trombone, using three different methods, the F1 score is similar across
+ all three methods. Essentially, we needed more data for more accurate predictions.
+ Crossref has abstracts for a subset of the deposited publication metadata.
+ Therefore, this data could not be used at this time for comparison. However,
+ having that data could possibly yield better results. The only way to do that
+ is to use a similar method to get those results. We do not have that currently,
+ and so, for now,  it becomes a chicken and egg thought exercise. Getting even
+ more data, such as full-text, could also produce interesting results, but
+ we do not have the data for that either. For now, Crossref decided to remove
+ the existing subject classifications that were present in some of our metadata.
+ We could revisit the problem later - if we have more data. There are certainly
+ interesting applications of these methods. We could:</p><ol><li>Look into
+ topic clustering across our metadata and see what surfaces. This could also
+ have applications in looking at the research zeitgeist across various time
+ periods.</li><li>Measure the similarities of embeddings with each other to
+ look at article similarities, which could yield interesting results in recommendations
+ and search.<br /></li></ol><p>Automated subject classification also raises
+ questions about fairness and bias in its algorithms and training and validation
+ data. It would also be productive to clearly understand how the algorithm
+ reaches its conclusions. Therefore, any automated system must be thoroughly
+ tested, and anyone using it should have a very good understanding of what
+ is happening within the algorithm.</p><p>This was an interesting exercise
+ for the author to get acquainted with machine learning and become familiar
+ with some of the available techniques.</p><p></p> ","published_at":1684834305,"updated_at":1691141202,"indexed_at":1691141631,"authors":[{"url":"https://orcid.org/0000-0001-9165-2757","name":"Esha
  Datta"}],"image":"https://upstream.force11.org/content/images/2023/05/esha-subject-blog.jpg","tags":["Original
- Research"],"language":"en","reference":[],"blog_id":"pm0p222","blog_name":"Upstream","blog":{"id":"pm0p222","title":"Upstream","description":"The
- community blog for all things Open Research.","language":"en","favicon":"https://upstream.force11.org/favicon.png","feed_url":"https://upstream.force11.org/atom-complete/","home_page_url":"https://upstream.force11.org","user_id":"8498eaf6-8c58-4b58-bc15-27eda292b1aa","created_at":"2023-05-31T07:23:49+00:00","indexed_at":"2023-01-13","feed_format":"application/atom+xml","license":"https://creativecommons.org/licenses/by/4.0/legalcode","generator":"Ghost
- 5.25","category":"Humanities","prefix":"10.54900","modified_at":"2023-07-04T21:15:51+00:00","version":"https://jsonfeed.org/version/1.1","backlog":false,"current_feed_url":"https://upstream.force11.org/atom/","expired":null}}'
- recorded_at: Tue, 11 Jul 2023 06:03:27 GMT
+ Research"],"language":"en","reference":[],"relationships":[],"blog_id":"pm0p222","blog_name":"Upstream","blog_slug":"upstream","blog":{"id":"pm0p222","title":"Upstream","description":"The
+ community blog for all things Open Research.","language":"en","favicon":"https://upstream.force11.org/favicon.png","feed_url":"https://upstream.force11.org/atom-complete/","home_page_url":"https://upstream.force11.org","user_id":"08014cf6-3335-4588-96f4-c77ac1e535b2","created_at":"2023-01-13","feed_format":"application/atom+xml","license":"https://creativecommons.org/licenses/by/4.0/legalcode","generator":"Ghost
+ 5.25","category":"humanities","prefix":"10.54900","modified_at":"2023-08-04T09:26:42+00:00","version":"https://jsonfeed.org/version/1.1","current_feed_url":"https://upstream.force11.org/atom/","status":"active","issn":null,"backlog":0,"authors":null,"plan":"Team","slug":"upstream","use_mastodon":false}}'
+ recorded_at: Wed, 06 Sep 2023 14:50:20 GMT
  recorded_with: VCR 6.2.0