commonmeta-ruby 3.5.5 → 3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +16 -5
  3. data/lib/commonmeta/readers/crossref_xml_reader.rb +1 -1
  4. data/lib/commonmeta/utils.rb +6 -6
  5. data/lib/commonmeta/version.rb +1 -1
  6. data/spec/cli_spec.rb +8 -8
  7. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref/default.yml +13 -13
  8. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref/to_bibtex.yml +13 -13
  9. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref/to_crossref_xml.yml +25 -25
  10. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref/to_datacite.yml +13 -13
  11. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref/to_schema_org.yml +13 -13
  12. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref_xml/default.yml +7 -7
  13. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref_xml/to_bibtex.yml +7 -7
  14. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref_xml/to_crossref_xml.yml +7 -59
  15. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref_xml/to_datacite.yml +7 -7
  16. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref_xml/to_schema_org.yml +7 -7
  17. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/default.yml +24 -24
  18. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/to_bibtex.yml +24 -24
  19. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/to_citation.yml +24 -24
  20. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/to_crossref_xml.yml +24 -24
  21. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/to_datacite.yml +24 -24
  22. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/to_jats.yml +24 -24
  23. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/crossref/to_schema_org.yml +24 -24
  24. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/datacite/default.yml +16 -16
  25. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/datacite/to_bibtex.yml +16 -16
  26. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/datacite/to_citation.yml +16 -16
  27. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/datacite/to_datacite.yml +16 -16
  28. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/datacite/to_jats.yml +16 -16
  29. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/datacite/to_schema_org.yml +16 -16
  30. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/schema_org/default.yml +479 -946
  31. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/schema_org/to_crossref_xml.yml +957 -1891
  32. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/schema_org/to_datacite.yml +479 -946
  33. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_from_id/schema_org/to_schema_org.yml +481 -950
  34. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/encode/by_blog.yml +5540 -968
  35. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/encode/by_blog_unknown_blog_id.yml +22 -29
  36. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/encode/by_id.yml +25 -39
  37. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/encode/by_id_unknown_uuid.yml +18 -28
  38. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/find_from_format_by_id/crossref.yml +7 -7
  39. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/find_from_format_by_id/datacite.yml +7 -7
  40. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/find_from_format_by_id/jalc.yml +7 -7
  41. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/find_from_format_by_id/kisti.yml +7 -7
  42. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/find_from_format_by_id/medra.yml +7 -7
  43. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/find_from_format_by_id/op.yml +7 -7
  44. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/json_feed/json_feed_blog_id.yml +19 -90
  45. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/json_feed/json_feed_by_blog.yml +5578 -246
  46. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/json_feed/json_feed_not_indexed.yml +13 -2201
  47. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/json_feed/json_feed_unregistered.yml +176 -72
  48. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/change_metadata_as_datacite_xml/with_data_citation.yml +16 -16
  49. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/crossref.yml +6 -6
  50. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/datacite.yml +6 -6
  51. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/jalc.yml +6 -6
  52. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/kisti.yml +6 -6
  53. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/medra.yml +6 -6
  54. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/not_found.yml +6 -6
  55. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/doi_registration_agency/op.yml +6 -6
  56. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/find_from_format_by_ID/crossref.yml +6 -6
  57. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/find_from_format_by_ID/crossref_doi_not_url.yml +6 -6
  58. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/find_from_format_by_ID/datacite.yml +6 -6
  59. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/find_from_format_by_ID/datacite_doi_http.yml +6 -6
  60. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/find_from_format_by_ID/unknown_DOI_registration_agency.yml +6 -6
  61. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_blog_id_for_json_feed_item_id/by_blog_post_id.yml +27 -105
  62. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_blog_id_for_json_feed_item_id/not_found.yml +20 -27
  63. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_cff_metadata/cff-converter-python.yml +51 -25
  64. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_cff_metadata/ruby-cff.yml +12 -12
  65. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_cff_metadata/ruby-cff_repository_url.yml +9 -9
  66. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_codemeta_metadata/maremma.yml +10 -10
  67. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_codemeta_metadata/metadata_reports.yml +11 -11
  68. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/DOI_with_ORCID_ID.yml +78 -78
  69. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/DOI_with_SICI_DOI.yml +76 -76
  70. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/DOI_with_data_citation.yml +35 -35
  71. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/JaLC.yml +162 -162
  72. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/KISTI.yml +131 -131
  73. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/OP.yml +75 -75
  74. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/affiliation_is_space.yml +76 -76
  75. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/another_book.yml +113 -113
  76. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/another_book_chapter.yml +74 -74
  77. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/article_id_as_page_number.yml +77 -77
  78. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/author_literal.yml +84 -84
  79. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/book.yml +77 -77
  80. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/book_chapter.yml +75 -75
  81. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/book_chapter_with_RDF_for_container.yml +73 -73
  82. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/book_oup.yml +72 -72
  83. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/component.yml +94 -94
  84. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/dataset.yml +104 -104
  85. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/dataset_usda.yml +136 -136
  86. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/date_in_future.yml +80 -80
  87. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/dissertation.yml +103 -103
  88. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/empty_given_name.yml +75 -75
  89. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/invalid_date.yml +77 -77
  90. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/journal_article.yml +76 -76
  91. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/journal_article_original_language_title.yml +73 -73
  92. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/journal_article_with.yml +128 -210
  93. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/journal_article_with_RDF_for_container.yml +74 -74
  94. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/journal_article_with_funding.yml +76 -76
  95. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/journal_issue.yml +72 -72
  96. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/mEDRA.yml +72 -72
  97. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/markup.yml +81 -81
  98. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/missing_contributor.yml +71 -71
  99. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/multiple_issn.yml +75 -75
  100. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/multiple_titles.yml +71 -71
  101. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/multiple_titles_with_missing.yml +573 -573
  102. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/not_found_error.yml +65 -65
  103. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/peer_review.yml +77 -77
  104. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/posted_content.yml +74 -74
  105. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/posted_content_copernicus.yml +76 -76
  106. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/report_osti.yml +120 -120
  107. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/vor_with_url.yml +78 -78
  108. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/yet_another_book.yml +74 -74
  109. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/yet_another_book_chapter.yml +73 -73
  110. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_raw/journal_article.yml +59 -59
  111. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/SoftwareSourceCode.yml +4 -4
  112. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/dissertation.yml +13 -13
  113. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/funding_references.yml +15 -15
  114. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/subject_scheme.yml +120 -120
  115. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_doi_prefix_for_blog/by_blog_id.yml +5540 -555
  116. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_doi_prefix_for_blog/by_blog_post_id.yml +31 -42
  117. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_doi_prefix_for_blog/by_blog_post_id_specific_prefix.yml +25 -39
  118. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed/by_blog_id.yml +5540 -247
  119. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed/not_indexed_posts.yml +14 -26
  120. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed/unregistered_posts.yml +176 -72
  121. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/archived_wordpress_post.yml +27 -95
  122. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/blog_post_with_non-url_id.yml +28 -106
  123. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/blogger_post.yml +21 -65
  124. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_author_name_suffix.yml +20 -208
  125. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_doi.yml +26 -97
  126. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_institutional_author.yml +24 -55
  127. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_organizational_author.yml +27 -70
  128. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_related_identifiers.yml +41 -143
  129. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_related_identifiers_and_funding.yml +54 -132
  130. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_with_related_identifiers_and_link_to_peer-reviewed_article.yml +304 -818
  131. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_without_doi.yml +24 -169
  132. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post.yml +24 -63
  133. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post_with_anonymous_author.yml +25 -40
  134. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/substack_post_with_broken_reference.yml +278 -591
  135. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/syldavia_gazette_post_with_references.yml +59 -101
  136. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/upstream_post_with_references.yml +135 -331
  137. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post.yml +24 -134
  138. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post_with_many_references.yml +578 -2967
  139. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post_with_references.yml +44 -205
  140. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/wordpress_post_with_tracking_code_on_url.yml +26 -160
  141. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_one_author/affiliation_is_space.yml +21 -21
  142. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_one_author/has_familyName.yml +15 -15
  143. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_one_author/has_name_in_display-order_with_ORCID.yml +13 -13
  144. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_one_author/name_with_affiliation_crossref.yml +16 -16
  145. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_one_author/only_familyName_and_givenName.yml +66 -61
  146. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/BlogPosting.yml +145 -146
  147. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/BlogPosting_with_new_DOI.yml +149 -150
  148. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/get_schema_org_metadata_front_matter/BlogPosting.yml +114 -115
  149. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/harvard_dataverse.yml +300 -289
  150. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/pangaea.yml +66 -61
  151. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/upstream_blog.yml +64 -57
  152. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_schema_org_metadata/zenodo.yml +27 -24
  153. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/handle_input/DOI_RA_not_Crossref_or_DataCite.yml +6 -6
  154. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/handle_input/unknown_DOI_prefix.yml +6 -6
  155. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/json_schema_errors/is_valid.yml +16 -16
  156. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/BlogPosting.yml +10 -10
  157. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/Dataset.yml +10 -10
  158. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/authors_with_affiliations.yml +16 -16
  159. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/climate_data.yml +10 -10
  160. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/from_schema_org.yml +145 -146
  161. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/keywords_subject_scheme.yml +8 -8
  162. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/maremma.yml +12 -12
  163. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/text.yml +8 -8
  164. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/with_data_citation.yml +16 -16
  165. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_bibtex/with_pages.yml +16 -16
  166. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_cff/Collection_of_Jupyter_notebooks.yml +13 -13
  167. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_cff/SoftwareSourceCode_Zenodo.yml +13 -13
  168. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_cff/SoftwareSourceCode_also_Zenodo.yml +8 -8
  169. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_cff/ruby-cff.yml +10 -10
  170. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_citation/Dataset.yml +10 -10
  171. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_citation/Journal_article.yml +16 -16
  172. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_citation/Journal_article_vancouver_style.yml +21 -21
  173. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_citation/Missing_author.yml +15 -15
  174. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_citation/interactive_resource_without_dates.yml +8 -8
  175. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_citation/software_w/version.yml +8 -8
  176. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_codemeta/SoftwareSourceCode_DataCite.yml +8 -8
  177. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_codemeta/SoftwareSourceCode_DataCite_check_codemeta_v2.yml +8 -8
  178. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_commonmeta/with_data_citation.yml +12 -12
  179. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/another_schema_org_from_front-matter.yml +32 -32
  180. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/journal_article.yml +5 -5
  181. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/journal_article_from_datacite.yml +5 -5
  182. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/journal_article_plos.yml +16 -16
  183. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_anonymous_author.yml +25 -40
  184. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_doi.yml +24 -134
  185. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_organizational_author.yml +27 -70
  186. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_relations.yml +41 -143
  187. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_rogue_scholar_with_relations_and_funding.yml +55 -133
  188. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_from_upstream_blog.yml +21 -224
  189. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/json_feed_item_with_references.yml +134 -330
  190. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/posted_content.yml +19 -19
  191. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/schema_org_from_another_science_blog.yml +9 -9
  192. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/schema_org_from_front_matter.yml +92 -91
  193. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/schema_org_from_upstream_blog.yml +6 -6
  194. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/Another_dataset.yml +8 -8
  195. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/BlogPosting.yml +10 -10
  196. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/BlogPosting_schema_org.yml +146 -147
  197. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/Dataset.yml +10 -10
  198. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/container_title.yml +16 -21
  199. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/interactive_resource_without_dates.yml +8 -8
  200. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/journal_article.yml +16 -16
  201. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/keywords_subject_scheme.yml +8 -8
  202. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/maremma.yml +10 -10
  203. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/missing_creator.yml +15 -15
  204. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/multiple_abstracts.yml +8 -8
  205. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/organization_author.yml +22 -22
  206. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/software.yml +8 -8
  207. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/software_w/version.yml +8 -8
  208. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/with_only_first_page.yml +16 -16
  209. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csl/with_pages.yml +16 -16
  210. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csv/climate_data.yml +10 -10
  211. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csv/maremma.yml +10 -10
  212. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csv/text.yml +8 -8
  213. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csv/with_data_citation.yml +16 -16
  214. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_csv/with_pages.yml +16 -16
  215. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_datacite/dissertation.yml +20 -20
  216. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_datacite/from_schema_org.yml +146 -147
  217. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_datacite/journal_article.yml +22 -22
  218. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_datacite/maremma.yml +10 -10
  219. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_datacite/with_ORCID_ID.yml +16 -16
  220. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_datacite/with_data_citation.yml +16 -16
  221. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/Dataset_in_schema_4_0.yml +10 -10
  222. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/Text_pass-thru.yml +8 -8
  223. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/book_chapter.yml +15 -15
  224. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/from_schema_org.yml +146 -147
  225. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/interactive_resource_without_dates.yml +8 -8
  226. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/maremma.yml +10 -10
  227. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/with_ORCID_ID.yml +16 -16
  228. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/with_data_citation.yml +16 -16
  229. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_jats_xml/with_editor.yml +17 -17
  230. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/BlogPosting.yml +10 -10
  231. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/BlogPosting_schema_org.yml +145 -146
  232. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/Dataset.yml +10 -10
  233. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/alternate_name.yml +8 -8
  234. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/journal_article.yml +9 -9
  235. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/keywords_with_subject_scheme.yml +8 -8
  236. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/maremma.yml +10 -10
  237. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_ris/with_pages.yml +10 -10
  238. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/Another_Schema_org_JSON.yml +10 -10
  239. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/Funding.yml +13 -13
  240. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/Funding_OpenAIRE.yml +13 -13
  241. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/Schema_org_JSON.yml +8 -8
  242. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/Schema_org_JSON_Cyark.yml +13 -13
  243. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/alternate_identifiers.yml +13 -13
  244. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/data_catalog.yml +13 -13
  245. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/geo_location_box.yml +13 -13
  246. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/interactive_resource_without_dates.yml +13 -13
  247. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/journal_article.yml +16 -16
  248. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/maremma_schema_org_JSON.yml +11 -11
  249. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/series_information.yml +17 -16
  250. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/subject_scheme.yml +15 -15
  251. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_schema_org/subject_scheme_multiple_keywords.yml +13 -13
  252. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_turtle/BlogPosting.yml +10 -10
  253. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_turtle/Dataset.yml +10 -10
  254. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_turtle/journal_article.yml +16 -16
  255. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_turtle/with_pages.yml +16 -16
  256. data/spec/readers/cff_reader_spec.rb +2 -20
  257. data/spec/readers/crossref_reader_spec.rb +10 -16
  258. data/spec/readers/crossref_xml_reader_spec.rb +61 -64
  259. data/spec/readers/json_feed_reader_spec.rb +56 -56
  260. data/spec/readers/schema_org_reader_spec.rb +1 -1
  261. data/spec/utils_spec.rb +1 -1
  262. data/spec/writers/crossref_xml_writer_spec.rb +9 -8
  263. data/spec/writers/csv_writer_spec.rb +1 -1
  264. data/spec/writers/ris_writer_spec.rb +2 -2
  265. metadata +2 -11
  266. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/convert_file/crossref_xml/to_crossref_xml_refresh.yml +0 -107
  267. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/doi_prefix/doi_prefix_by_blog.yml +0 -997
  268. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/doi_prefix/doi_prefix_by_uuid.yml +0 -256
  269. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/encode/by_uuid.yml +0 -256
  270. data/spec/fixtures/vcr_cassettes/Commonmeta_CLI/encode/by_uuid_unknown_uuid.yml +0 -49
  271. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_crossref_metadata/missing_creator.yml +0 -307
  272. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_doi_prefix_for_blog/by_blog_post_uuid.yml +0 -136
  273. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_doi_prefix_for_blog/by_blog_post_uuid_specific_prefix.yml +0 -256
  274. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/write_metadata_as_crossref/book_oup.yml +0 -107
@@ -2,15 +2,15 @@
2
2
  http_interactions:
3
3
  - request:
4
4
  method: get
5
- uri: https://rogue-scholar.org/api/posts/5d14ffac-b9ac-4e20-bdc0-d9248df4e80d
5
+ uri: https://api.rogue-scholar.org/posts/5d14ffac-b9ac-4e20-bdc0-d9248df4e80d
6
6
  body:
7
- encoding: UTF-8
7
+ encoding: ASCII-8BIT
8
8
  string: ''
9
9
  headers:
10
10
  Connection:
11
11
  - close
12
12
  Host:
13
- - rogue-scholar.org
13
+ - api.rogue-scholar.org
14
14
  User-Agent:
15
15
  - http.rb/5.1.1
16
16
  response:
@@ -18,232 +18,29 @@ http_interactions:
18
18
  code: 200
19
19
  message: OK
20
20
  headers:
21
- Age:
22
- - '0'
23
- Cache-Control:
24
- - public, max-age=0, must-revalidate
25
- Content-Length:
26
- - '17466'
27
21
  Content-Type:
28
- - application/json; charset=utf-8
22
+ - application/json
23
+ Content-Length:
24
+ - '1845'
29
25
  Date:
30
- - Wed, 06 Sep 2023 14:50:20 GMT
31
- Etag:
32
- - '"v3f9t3mndwdfq"'
26
+ - Thu, 05 Oct 2023 21:22:56 GMT
33
27
  Server:
34
- - Vercel
35
- Strict-Transport-Security:
36
- - max-age=63072000
37
- X-Matched-Path:
38
- - "/api/posts/[[...params]]"
39
- X-Vercel-Cache:
40
- - MISS
41
- X-Vercel-Id:
42
- - fra1::iad1::jghz2-1694011819953-8831a8905b67
43
- Connection:
44
- - close
28
+ - Fly/e440b950 (2023-09-20)
29
+ Via:
30
+ - 1.1 fly.io
31
+ Fly-Request-Id:
32
+ - 01HC0VHSQ6KY8KWP86JAFAGBW6-fra
45
33
  body:
46
34
  encoding: UTF-8
47
- string: '{"id":"5d14ffac-b9ac-4e20-bdc0-d9248df4e80d","doi":"https://doi.org/10.54900/n6dnt-xpq48","url":"https://upstream.force11.org/attempts-at-automating-journal-subject-classification","title":"Attempts
48
- at automating journal subject classification","summary":"Traditionally, journal
49
- subject classification was done manually at varying levels of granularity,
35
+ string: '{"archive_url":null,"authors":[{"name":"Esha Datta","url":"https://orcid.org/0000-0001-9165-2757"}],"blog":{"api":true,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"humanities","created_at":"2023-01-13","current_feed_url":"https://upstream.force11.org/atom/","description":"The
36
+ community blog for all things Open Research.","favicon":"https://upstream.force11.org/favicon.png","feed_format":"application/atom+xml","feed_url":"https://upstream.force11.org/atom-complete/","filter":null,"funding":null,"generator":"Ghost
37
+ 5.25","home_page_url":"https://upstream.force11.org","id":"pm0p222","issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","modified_at":"2023-09-24T13:05:54+00:00","plan":"Team","prefix":"10.54900","relative_url":null,"slug":"upstream","status":"active","title":"Upstream","use_api":true,"use_mastodon":false,"user_id":"08014cf6-3335-4588-96f4-c77ac1e535b2","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Upstream","blog_slug":"upstream","doi":"https://doi.org/10.54900/n6dnt-xpq48","id":"5d14ffac-b9ac-4e20-bdc0-d9248df4e80d","image":"https://upstream.force11.org/content/images/2023/05/esha-subject-blog.jpg","indexed_at":1691141631,"language":"en","published_at":1684834305,"reference":[],"relationships":[],"summary":"Traditionally,
38
+ journal subject classification was done manually at varying levels of granularity,
50
39
  depending on the use case for the institution. Subject classification is done
51
40
  to help collate resources by subject enabling the user to discover publications
52
- based on different levels of subject specificity.","content_html":" <p><img
53
- src=\"https://upstream.force11.org/content/images/2023/05/esha-subject-blog.jpg\"
54
- /></p><p>Traditionally, journal subject classification was done manually at
55
- varying levels of granularity, depending on the use case for the institution.
56
- Subject classification is done to help collate resources by subject enabling
57
- the user to discover publications based on different levels of subject specificity.
58
- It can also be used to help determine where to publish and the direction a
59
- particular author may be pursuing in their research if one wants to track
60
- where their work is being published. Currently, most subject classification
61
- is done manually as it is a speciality that requires a lot of training. However,
62
- this effort can be siloed by institution or can be hampered by various inter-institutional
63
- agreements that prevent other resources from being classified. It could also
64
- prevent a standardized approach to classifying items if different publications
65
- in separate institutions use different taxonomies and classification systems.
66
- Automating classification work surfaces questions about the relevance of the
67
- taxonomy used, the potential bias that might exist, and the texts being classified.
68
- Currently, journals are classified using various taxonomies and are siloed
69
- in many systems, such as library databases or software for publishers. Providing
70
- a service that can automatically classify a text (and provide a measure of
71
- accuracy!) outside of a specific system can democratize access to this information
72
- across all systems. Crossref infrastructure enables a range of services for
73
- the research community; we have a wealth of metadata created by a very large
74
- global community. We wondered how we could contribute in this area.</p><p>In
75
- our own metadata corpus, we had subject classifications for a subset of our
76
- journals provided by Elsevier. However, this meant that we were providing
77
- subject information unevenly across our metadata. We wondered if we could
78
- extrapolate the information and provide the data across all our metadata.</p><p>We
79
- looked specifically at journal-level classification instead of article-level
80
- classification for a few reasons. We had the training data for journal-level
81
- subject classification; it was a good place to begin understanding what would
82
- be needed. Our work so far provides a foundation for further article-level
83
- classification - if Crossref decides to investigate further.</p><p>To start
84
- with, I used Elsevier’s All Science Journal Classification Codes (<a href=\"https://service.elsevier.com/app/answers/detail/a_id/15181/supporthub/scopus/\">ASJC</a>),
85
- which have been applied to their <a href=\"https://www.elsevier.com/solutions/scopus/how-scopus-works/content\">database</a>
86
- of publications, which includes journals and books. We used ASJC because it
87
- contained metadata that could be parsed programmatically. If the project progressed
88
- well, we felt that we could look at other classification systems.</p><p>After
89
- pre-processing, three methods (tf-idf, Embeddings, LLM) were used, and their
90
- performances were benchmarked. The following outlines the steps taken for
91
- the pre-processing, cleaning, and implementation details of the methods used
92
- to predict the subject classification of journals.</p><h3>Pre-processing of
93
- data</h3><p>The Excel document was processed as a CSV file and has various
94
- information, including journal titles, the corresponding print and e- ISSNs,
95
- and their ASJC codes. The journals were mostly in English but were also in
96
- many other languages, such as Russian, Italian, Spanish, Chinese, and others.
97
- First, there was a process to see which journals in the Elsevier list also
98
- existed in the Crossref corpus. As of June 2022, there were 26,000 journals
99
- covered by the Elsevier database. The journals could contain one or many subject
100
- categories. For example, the <em>Journal of Children’s Services</em> has several
101
- subjects assigned to them, such as Law, Sociology and Political Science, Education,
102
- and Health. The journal titles have some data, but not a lot. They averaged
103
- about four words per title, so more data was needed. First, 10 - 20 journal
104
- article titles per journal were added if there were that many journal articles
105
- available. At Crossref, a few journal articles contain abstracts, but not
106
- all. So, for the moment, journal titles and their corresponding article titles
107
- were the additional data points that were used.</p><h5><strong>Cleaning the
108
- data</strong></h5><p>The data was cleaned up to remove stop words, various
109
- types of formulae, and XML from the titles. Stop words generally consist of
110
- articles, pronouns, conjunctions, and other frequently used words. The <a
111
- href=\"https://github.com/stopwords-iso/stopwords-iso\">stop words list</a>
112
- of all languages in the ISO-639 standard was used to process the titles. Some
113
- domain-specific terms to the stop words, such as “journal”, “archive”, “book”,
114
- “studies”, and so on, were also added to the list. Formulae and XML tags were
115
- removed with regular expressions. Rare subject categories that were assigned
116
- to very few journals (less than 50 out of 26000 journals)  were also removed.
117
- The cleaned data was now ready for processing. It was split into training,
118
- validation, and test sets.</p><h3>Methods</h3><p>This particular type of classification
119
- is known as a multi-label classification problem since zero, or many subjects
120
- can be assigned to a journal. Three methods were used to see which performed
121
- best.</p><h4><strong>TF-IDF + Linear Support Vector Classification</strong></h4><p>The
122
- first approach used the tf-idf and multilabel binarizer libraries from <a
123
- href=\"https://scikit-learn.org/stable/index.html\">scikit learn</a>. <a href=\"https://en.wikipedia.org/wiki/Tf%E2%80%93idf\">Tf-idf</a>
124
- is a numerical statistic that is intended to reflect how important a word
125
- is to a document in a collection. Using tf-idf, a  number of different strategies
126
- that can be used within a multi-label classification problem were benchmarked.
127
- The tf-idf vectorizer and multilabel binarizer are Python libraries that convert
128
- data into machine parseable vectors. Essentially, the data is a table of journal
129
- and article titles and their corresponding subjects.</p><p>A baseline prediction
130
- was needed to benchmark the performance of the strategies used. This prediction
131
- was made by comparing the presence of the subject codes assigned to the journal
132
- with the most common subject codes present in the corpus. The measure used
133
- to compare the performances was the micro <a href=\"https://en.wikipedia.org/wiki/F-score\">F1</a>
134
- score. The micro F1 score of the baseline prediction was 0.067. It shows that
135
- applying a naive approach will provide a prediction at 6.67% accuracy. That
136
- measure provided a good starting point to get an idea of the performance of
137
- subsequent methods.</p><p>Among the strategies used, the best-performing strategy
138
- was One vs Rest using LinearSVC. The micro F1 score was 0.43 after processing
139
- 20,000 features using the validation dataset. This was a decent increase from
140
- the baseline; however, it is still not very serviceable. In order to improve
141
- performance, it was decided to reduce the granularity of subjects. For example,
142
- the journal, <em>Journal of Children’s Services,</em> has several subjects
143
- assigned to them, such as Law, Sociology and Political Science'', Education,
144
- and Health. Elsevier’s ASJC subjects are in hierarchies. There are several
145
- subgroups of fields within some overarching fields. For example, the group,
146
- Medicine, has several specialities of medicine listed under it. The subjects,
147
- Social Sciences and Psychology work similarly. They are two separate fields
148
- of study, and the journal has articles that apply to either or both fields
149
- of study. The subjects listed in the  <em>Journal of Children’s Services </em>are
150
- in two different groups: Social Sciences and Psychology. Downgrading the granularity
151
- makes the learning process a little simpler. So, instead of the  <em>Journal
152
- of Children’s Services </em>belonging to several different subjects, the journal
153
- now belonged to two subjects. Using the same strategy, one vs rest with LinearSVC,
154
- we get an F1 score of 0.72 for the same number of titles. This was a marked
155
- improvement from before. There were other avenues that could be looked at,
156
- such as bringing in more data in the form of references, but there were also
157
- other methods to look at. We were curious about the role of embeddings and
158
- decided to pursue that approach.</p><h4><strong>Embeddings + Linear Support
159
- Vector Classification</strong></h4><p>This approach is slightly different
160
- from the tf-idf approach. For the titles, we decided to use a model that was
161
- already trained on a scientific corpus. For this, AllenAI’s <a href=\"https://github.com/allenai/scibert\">SciBERT</a>
162
- was used, a fine-tuned <a href=\"https://arxiv.org/abs/1810.04805\">BERT</a>
163
- model trained on papers from the corpus of <a href=\"https://semanticscholar.org\">semanticscholar.org</a>;
164
- a tool provided by AllenAI. The model provides an embedding: a vector representation
165
- of the titles, based on the data it has already been trained on. This allows
166
- it to provide more semantic weight on the data rather than simple occurrence
167
- of the words in the document (this occurs with the previous method, tf-idf).
168
- The generation of the embedding took over 18 hours on a laptop, but after
169
- that, generating predictions became quite fast. The amount of data needed
170
- to generate this vector is also lower than the tf-idf generation. The subjects
171
- were processed similarly to before and generated a vector using the multilabel
172
- binarizer. With 512 features from the titles (instead of 20,000) in the previous
173
- approach, the same strategy was used as earlier. Using the one vs rest strategy
174
- with LinearSVC the strategy was run against the validation set and got a F1
175
- score of 0.71. </p><p>So far, the tally is:</p><table>\n<thead>\n<tr>\n<th>Method</th>\n<th>F1
176
- Score</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>Tf-idf + multilabel binarizer</td>\n<td>0.73</td>\n</tr>\n<tr>\n<td>SciBERT
177
- embedding + multilabel binarizer</td>\n<td>0.71</td>\n</tr>\n</tbody>\n</table>\n<p>At
178
- this point, we were going to look into gathering more data points such as
179
- references and run a comparison between these two methods. However, large
180
- language models, especially ChatGPT, came into the zeitgeist, a few weeks
181
- into mulling over other options.</p><h4><strong>OpenAI: LLM + sentence completion</strong></h4><p>Out
182
- of curiosity, the author looked to see what chatGPT could do. ChatGPT was
183
- asked to figure out what topics an existing journal title belonged to, and
184
- it came very close to predicting the correct answer. The author also asked
185
- it to figure out to which topic multiple Dutch journal article titles belonged,
186
- and it predicted the correct answer again. The author decided to investigate
187
- this avenue knowing that if there were good results, open large language models
188
- would be used to see if there would be comparable results. The screenshot
189
- below shows the examples listed above.</p><figure><img src=\"https://upstream.force11.org/content/images/2023/08/openai_experiment.png\"
190
- loading=\"lazy\" width=\"1600\" height=\"1495\" srcset=\"https://upstream.force11.org/content/images/size/w600/2023/08/openai_experiment.png
191
- 600w, https://upstream.force11.org/content/images/size/w1000/2023/08/openai_experiment.png
192
- 1000w, https://upstream.force11.org/content/images/2023/08/openai_experiment.png
193
- 1600w\" /></figure><p>Subjects had to be processed a little differently for
194
- this model. The ASJC codes have subjects in text form as well as numerical
195
- values. For example, if there is a journal classified as “Medicine”, it has
196
- a code of “27”. The author fine-tuned the openAI model using their “ada” model
197
-   (it is the fastest and the cheapest) and sent it some sentence completion
198
- prompts. Essentially, this means that the model is being fine-tuned into telling
199
- it what subject codes it needs to complete the sentences that it is being
200
- sent. So, suppose several different titles are sent to the model and asked
201
- to complete it with several delimited subject codes. In that case, the model
202
- should be able to predict which subject codes should complete the sentences.
203
- A set of prompts were created with the journal titles and their corresponding
204
- subject codes as the sentence completion prompt to train the model. It looked
205
- like this:</p><p><strong><code>{\"prompt\":\"Lower Middle Ordovician carbon
206
- and oxygen…..,\"completion\":\" 11\\n19\"}</code></strong></p><p>The above
207
- snippet has several different titles where the subjects assigned to these
208
- titles are 11 and 19, which are <em>Agricultural and Biological Sciences</em>
209
- and<em> Earth and Planetary Sciences,</em> respectively.</p><p>The openAI’s
210
- API was used to fine-tune and train a model using the above prompts, and $10.00
211
- later, generated a model.</p><figure><img src=\"https://upstream.force11.org/content/images/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png\"
212
- loading=\"lazy\" width=\"1600\" height=\"702\" srcset=\"https://upstream.force11.org/content/images/size/w600/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png
213
- 600w, https://upstream.force11.org/content/images/size/w1000/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png
214
- 1000w, https://upstream.force11.org/content/images/2023/08/data-src-image-60e0df22-f6e0-4c81-adf0-fe21d2839897.png
215
- 1600w\" /></figure><p>The validation dataset was run against the model and
216
- got a micro F1 score of 0.69. So, the tally now is:</p><table>\n<thead>\n<tr>\n<th>Method</th>\n<th>F1
217
- Score</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>Tf-idf + multilabel binarizer</td>\n<td>0.73</td>\n</tr>\n<tr>\n<td>SciBERT
218
- embedding + multilabel binarizer</td>\n<td>0.71</td>\n</tr>\n<tr>\n<td>ChatGPT
219
- + sentence completion</td>\n<td>0.69</td>\n</tr>\n</tbody>\n</table>\n<h3>Summary</h3><p>So,
220
- sad trombone, using three different methods, the F1 score is similar across
221
- all three methods. Essentially, we needed more data for more accurate predictions.
222
- Crossref has abstracts for a subset of the deposited publication metadata.
223
- Therefore, this data could not be used at this time for comparison. However,
224
- having that data could possibly yield better results. The only way to do that
225
- is to use a similar method to get those results. We do not have that currently,
226
- and so, for now,  it becomes a chicken and egg thought exercise. Getting even
227
- more data, such as full-text, could also produce interesting results, but
228
- we do not have the data for that either. For now, Crossref decided to remove
229
- the existing subject classifications that were present in some of our metadata.
230
- We could revisit the problem later - if we have more data. There are certainly
231
- interesting applications of these methods. We could:</p><ol><li>Look into
232
- topic clustering across our metadata and see what surfaces. This could also
233
- have applications in looking at the research zeitgeist across various time
234
- periods.</li><li>Measure the similarities of embeddings with each other to
235
- look at article similarities, which could yield interesting results in recommendations
236
- and search.<br /></li></ol><p>Automated subject classification also raises
237
- questions about fairness and bias in its algorithms and training and validation
238
- data. It would also be productive to clearly understand how the algorithm
239
- reaches its conclusions. Therefore, any automated system must be thoroughly
240
- tested, and anyone using it should have a very good understanding of what
241
- is happening within the algorithm.</p><p>This was an interesting exercise
242
- for the author to get acquainted with machine learning and become familiar
243
- with some of the available techniques.</p><p></p> ","published_at":1684834305,"updated_at":1691141202,"indexed_at":1691141631,"authors":[{"url":"https://orcid.org/0000-0001-9165-2757","name":"Esha
244
- Datta"}],"image":"https://upstream.force11.org/content/images/2023/05/esha-subject-blog.jpg","tags":["Original
245
- Research"],"language":"en","reference":[],"relationships":[],"blog_id":"pm0p222","blog_name":"Upstream","blog_slug":"upstream","blog":{"id":"pm0p222","title":"Upstream","description":"The
246
- community blog for all things Open Research.","language":"en","favicon":"https://upstream.force11.org/favicon.png","feed_url":"https://upstream.force11.org/atom-complete/","home_page_url":"https://upstream.force11.org","user_id":"08014cf6-3335-4588-96f4-c77ac1e535b2","created_at":"2023-01-13","feed_format":"application/atom+xml","license":"https://creativecommons.org/licenses/by/4.0/legalcode","generator":"Ghost
247
- 5.25","category":"humanities","prefix":"10.54900","modified_at":"2023-08-04T09:26:42+00:00","version":"https://jsonfeed.org/version/1.1","current_feed_url":"https://upstream.force11.org/atom/","status":"active","issn":null,"backlog":0,"authors":null,"plan":"Team","slug":"upstream","use_mastodon":false}}'
248
- recorded_at: Wed, 06 Sep 2023 14:50:20 GMT
41
+ based on different levels of subject specificity.","tags":["Original Research"],"title":"Attempts
42
+ at automating journal subject classification","updated_at":1691141202,"url":"https://upstream.force11.org/attempts-at-automating-journal-subject-classification"}
43
+
44
+ '
45
+ recorded_at: Thu, 05 Oct 2023 21:22:57 GMT
249
46
  recorded_with: VCR 6.2.0