anystyle 1.4.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5dd18e66e792426e7c6101a17785637f70b63b5384afe7db81406c1cebfe8708
4
- data.tar.gz: 1aada4ba80830fee3350e58f2eca4d9efddef8956864181b96983e54f41dcb9c
3
+ metadata.gz: 1ee4c3d7966aea4ee9ebe011d4a8bee37126caa2f0d1d44e3e238c66d15548f4
4
+ data.tar.gz: 6accc8256de87e852b4e8ef51485fce60fd566cf7614a1c1bd0b75c1e2ada9af
5
5
  SHA512:
6
- metadata.gz: 3b0bf084efbd329498bbfecae8ee2c24be0ab3335f7540c7c2d59ab79fc3d163c099cc8e27683e5fd31f7a21421ee09e44eedd835dd0688c40f8f477ca457c3a
7
- data.tar.gz: e17bb0d4745aa5947d100e81959b95bc515f697fa55f3f6500386a2231deda34858e161f63b8ce94fd53b682578997648a6db598c6396e8da5e0d3eff1d3e6c3
6
+ metadata.gz: 7d56bfa4eb7b43302a2cd48558242ea9cd636efa0625a2217f06cbf318d788c6ef01f3b706a143fb15c0bc843ac6ea3cefba8da6046052dd1bef3967e72fb029
7
+ data.tar.gz: 4ab2159fada17aa37ab70c17060ab04d5e92cf3ec5a112045f1035828dba49272d6542395f533c44daaf31686ad00b3ede99b3059eb3b653ebf3d2b153a0ba52
@@ -0,0 +1,50 @@
1
+ name: CI
2
+ on:
3
+ push:
4
+ branches: [ main ]
5
+ pull_request:
6
+ branches: [ main ]
7
+ concurrency:
8
+ group: ci-${{ github.ref }}
9
+ cancel-in-progress: true
10
+ jobs:
11
+ ci:
12
+ name: ${{ matrix.ruby-version }} ${{ matrix.friendlyName }}-${{ matrix.arch }}
13
+ runs-on: ${{ matrix.os }}
14
+
15
+ strategy:
16
+ matrix:
17
+ ruby-version:
18
+ - "3.2"
19
+ - "3.3"
20
+ - "3.4"
21
+ os:
22
+ - ubuntu-latest
23
+ - macos-latest
24
+ - windows-latest
25
+ arch:
26
+ - x64
27
+ include:
28
+ - os: ubuntu-latest
29
+ friendlyName: Linux
30
+ - os: macos-latest
31
+ friendlyName: macOS
32
+ - os: windows-latest
33
+ friendlyName: Windows
34
+
35
+ steps:
36
+ - name: Checkout repository
37
+ uses: actions/checkout@v4
38
+ - name: Setup Ruby and install bundle
39
+ uses: ruby/setup-ruby@v1
40
+ with:
41
+ ruby-version: ${{ matrix.ruby-version }}
42
+ bundler-cache: true
43
+ - name: Compile and run test
44
+ run: bundle exec rake
45
+ - name: Upload coverage results
46
+ if: matrix.ruby-version == '3.3'
47
+ continue-on-error: true
48
+ uses: coverallsapp/github-action@v2
49
+ with:
50
+ github-token: ${{ github.token }}
data/HISTORY.md CHANGED
@@ -1,6 +1,10 @@
1
+ 1.6.0 / 2025-05-11
2
+ ==================
3
+ * Added RIS output format (@ColorBlindHobbiest).
4
+
1
5
  1.4.0 / 2023-01-06
2
6
  ==================
3
- * Removed deprectate string taint checking (@bbonamin).
7
+ * Removed deprecated string taint checking (@bbonamin).
4
8
  * `AnyStyle::Parser#parse` will no longer automatically open local files.
5
9
  Please call `Wapiti::Dataset.open` explicitly if you relied on this.
6
10
 
@@ -17,7 +21,7 @@
17
21
  ==================
18
22
  * Updated and improved normalizers and CSL format.
19
23
  * Improved Chinese reference tokenization.
20
- * Added option to customizee pdftotext path.
24
+ * Added option to customize pdftotext path.
21
25
  * Improved Finder reference line joining.
22
26
  * Improved Finder model; training sets.
23
27
  * Improved Parser model; training sets.
@@ -41,15 +45,15 @@
41
45
 
42
46
  1.0.1 / 2018-06-06
43
47
  ==================
44
- * Initial 1.0 release! This release is not backwards compatible to the
45
- 0.x branch. The new release uses the `AnyStyle` module and can be
46
- installed using the `anystyle` Gem. The 0.x branch used the `Anystyle`
47
- module and can still be installed using the `anystyle-parser` Gem but
48
- will not be maintained any longer.
49
- * Includes vastly improved parser model and training sets.
50
- * Based on updated `wapiti-ruby` which builds on Linux, macOS, and
51
- Windows platforms (thanks @a-fent and @WouterJeuris).
52
- * Flexible normalizer architecture (normalizers can be skipped individually).
48
+ * Initial 1.0 release!
49
+ This release isn't backwards compatible to the 0.x branch.
50
+ The new release uses the `AnyStyle` module via the `anystyle` Gem.
51
+ The old 0.x branch used the `Anystyle` module via the `anystyle-parser` Gem
52
+ but isn't maintained any longer.
53
+ * Includes improved parser model and training sets.
54
+ * Based on updated `wapiti-ruby` which builds on Linux, macOS, and Windows platforms
55
+ (thanks @a-fent and @WouterJeuris).
56
+ * Flexible normalizer architecture (you can skip individual normalizers).
53
57
  * Improved feature architecture.
54
58
  * Improved input/output via Wapiti::Dataset.
55
59
  * New default dictionary adapter (thanks @a-fent).
data/README.md CHANGED
@@ -1,24 +1,24 @@
1
1
  AnyStyle
2
2
  ========
3
- [![Build Status](https://travis-ci.org/inukshuk/anystyle.svg?branch=master)](https://travis-ci.org/inukshuk/anystyle)
4
- [![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/anystyle?branch=master)
3
+ [![CI](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml/badge.svg)](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml)
4
+ [![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg)](https://coveralls.io/github/inukshuk/anystyle)
5
5
 
6
- AnyStyle is a very fast and smart parser for academic references. It
7
- was originally inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/)
8
- and [FreeCite](http://freecite.library.brown.edu/); AnyStyle uses machine
9
- learning algorithms and aims to make it easy to train the model with data
10
- that is relevant to your parsing needs.
6
+ AnyStyle is a fast and smart parser of bibliographic references.
7
+ Originally inspired by [parsCit][] and [FreeCite][],
8
+ AnyStyle uses machine learning algorithms
9
+ and aims to make it easy to train models
10
+ with data that's relevant to you.
11
11
 
12
12
 
13
- Using AnyStyle CLI
14
- ------------------
15
-
13
+ Using AnyStyle on the command line
14
+ ----------------------------------
16
15
  $ [sudo] gem install anystyle-cli
17
16
  $ anystyle --help
18
17
  $ anystyle help find
19
18
  $ anystyle help parse
20
19
 
21
- See [anystyle-cli](https://github.com/inukshuk/anystyle-cli) for more details.
20
+ See [anystyle-cli][] for more details.
21
+
22
22
 
23
23
  Using AnyStyle in Ruby
24
24
  ----------------------
@@ -26,8 +26,9 @@ Install the `anystyle` gem.
26
26
 
27
27
  $ [sudo] gem install anystyle
28
28
 
29
- Once installed, you can use the static Parser and Finder instances
30
- by calling the `AnyStyle.parse` or `AnyStyle.find` methods. For example:
29
+ Now you can use the static Parser and Finder instances
30
+ by calling the `AnyStyle.parse` or `AnyStyle.find` methods.
31
+ For example:
31
32
 
32
33
  ```ruby
33
34
  require 'anystyle'
@@ -46,77 +47,86 @@ pp AnyStyle.parse 'Derrida, J. (1967). L’écriture et la différence (1 éd.).
46
47
  #}]
47
48
  ```
48
49
 
49
- Alternatively, you can create your own `AnyStyle::Parser` or
50
- `AnyStyle::Finder` with custom options.
50
+ You can also create your own
51
+ `AnyStyle::Parser` or `AnyStyle::Finder` with custom options.
51
52
 
52
53
 
53
- Using the AnyStyle Web App
54
- --------------------------
55
- AnyStyle is available as web application at [anystyle.io](https://anystyle.io).
54
+ Using AnyStyle on the web
55
+ -------------------------
56
+ AnyStyle is available at [anystyle.io][].
56
57
 
57
- The web application [is open source](https://github.com/inukshuk/anystyle.io)
58
- and you can also host yourself!
58
+ The web application is [open source][]
59
+ and you're welcome to host your own instance!
59
60
 
60
- Improving results for your data
61
- =================================
61
+ [anystyle-cli]: https://github.com/inukshuk/anystyle-cli
62
+ [anystyle.io]: https://anystyle.io
63
+ [open source]: https://github.com/inukshuk/anystyle.io
64
+ [parsCit]: http://aye.comp.nus.edu.sg/parsCit/
65
+ [FreeCite]: http://freecite.library.brown.edu/
62
66
 
67
+
68
+ Improving results for your data
69
+ ===============================
63
70
  Training
64
71
  --------
65
- You can train custom Finder and Parser models. To do this, you need
66
- to prepare your own data sets for training. You can create your own
67
- data from scratch or build on AnyStyle's default sets. The default
68
- parser model is based on the
69
- [core](https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml)
70
- data set; the default finder model source data is not publicly
71
- available in its entirety, but you can find a number of tagged
72
- documents
73
- [here](https://github.com/inukshuk/anystyle/blob/master/res/finder).
74
-
75
- When you have compiled a data set for training, you will be ready
76
- to create your own model:
72
+ You can train custom Finder and Parser models.
73
+ To do this, you need to prepare your own data sets for training.
74
+ You can create your own data from scratch
75
+ or build on AnyStyle's default sets.
76
+ The default parser model uses the [core][] data set.
77
+ And though the finder model sources aren't available in their entirety,
78
+ due to copyright restrictions,
79
+ you can find several [tagged documents][] here.
80
+
81
+ When you have compiled a data set for training,
82
+ you will be ready to create your own model:
77
83
 
78
84
  $ anystyle train training-data.xml custom.mod
79
85
 
80
- This will save your new model as `custom.mod`. To use your model
81
- instead of AnyStyle's default, use the `-P` or `--parser-model` flag
82
- and, respectively, `-F` or `--finder-model` to use a custom Finder
83
- model. For instance, the command below would parse all references
84
- in `bib.txt` using the custom model we just trained and print the
85
- result to STDOUT using the JSON output format:
86
+ This will save your new model as `custom.mod`.
87
+ To use your model instead of AnyStyle's default,
88
+ use the `-P` or `--parser-model` flag and, respectively,
89
+ `-F` or `--finder-model` to use a custom finder model.
90
+ For instance, the command below
91
+ will parse a file `bib.txt` with the custom model
92
+ and print the result to STDOUT in JSON format:
86
93
 
87
94
  $ anystyle -P custom.mod -f json parse bib.txt -
88
95
 
89
- When training your own models, it is good practice to check the
90
- quality using a second data set. For example, using AnyStyle's own
91
- [gold](https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml)
92
- data set (a large, manually curated data set) we could check our
93
- custom model like this:
96
+ When training your own models, it's good practice
97
+ to check their quality using a second data set.
98
+ For example, to check your custom model
99
+ using AnyStyle's manually curated [gold][] data set:
94
100
 
95
101
  $ anystyle -P x.mod check ./res/parser/gold.xml
96
102
  Checking gold.xml................. 1 seq 0.06% 3 tok 0.01% 3s
97
103
 
98
- This command will print the sequence and token error rates; in
99
- the case of AnyStyle a the number of sequence errors is the number
100
- of references which were tagged differently by the parser than they
101
- were in the input; the number of token errors is the total number of
102
- words across all the references which were tagged differently. In the
103
- example above, we got one reference wrong (out of 1700 at the time);
104
- but even this one reference was mostly tagged correctly, because only
105
- a total of 3 words were tagged differently.
104
+ This command prints sequence and token error rates.
105
+ Here, sequence errors are the number of references
106
+ tagged differently by the parser
107
+ as compared to the curated input;
108
+ the number of token errors
109
+ is the total number of words tagged differently across these references.
110
+ In the example above, one reference was wrong
111
+ (out of 1,700 at the time),
112
+ because a total of three words had a different tag.
113
+
114
+ When working with training data,
115
+ it's a good idea to use the `Wapiti::Dataset` API in Ruby:
116
+ it supports standard set operators
117
+ and makes it easy to combine or compare data sets.
118
+
119
+ [core]: https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml
120
+ [gold]: https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml
121
+ [tagged documents]: https://github.com/inukshuk/anystyle/blob/master/res/finder
106
122
 
107
- When working with training data, it is a good idea to use the
108
- `Wapiti::Dataset` API in Ruby: it supports all the standard set
109
- operators and makes it very easy to combine or compare data sets.
110
123
 
111
124
  Natural Languages used in AnyStyle
112
125
  ----------------------------------
113
-
114
- As mentioned above, the
115
- [core](https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml)
116
- dataset contains the manually marked-up references that are used as the
117
- basis for the default AnyStyle parsing model. If the references you are
118
- trying to parse include many non-English documents, the distribution of
119
- natural languages in this corpus is relevant (detected using [cld](https://github.com/jtoy/cld)).
126
+ The [core][] data set contains the manually marked-up references
127
+ which comprise AnyStyle's default parser model.
128
+ If your references include non-English documents,
129
+ the distribution of natural languages in this corpus is relevant.
120
130
 
121
131
  | Language | n |
122
132
  |-------------------------|-----|
@@ -129,42 +139,51 @@ natural languages in this corpus is relevant (detected using [cld](https://githu
129
139
  | Not reliably determined | 449 |
130
140
  | (but mainly English) | |
131
141
 
132
- (These data are based on AnyStyle version 1.3.13)
142
+ (Measured using [cld][] and AnyStyle version 1.3.13)
133
143
 
134
144
  There is a strong prevalence of English-language documents with the
135
- conventions used in English-language bibliographies, with some
136
- representation of other European languages. The languages used reflect
137
- those used in scientific publishing as well as the maintainers'
138
- competencies. If you are working with many documents in languages other
139
- than English, you might consider training the model with some examples
145
+ conventions used in English-language bibliographies,
146
+ with some representation of other European languages.
147
+ The languages used reflect those used in scientific publishing
148
+ as well as the maintainers' competencies.
149
+ If you are working with documents in languages other than English,
150
+ you might consider training the model with some examples
140
151
  in the relevant languages.
141
152
 
142
- AnyStyle should work with references written in any Latin script
143
- (including most European languages, languages such as Indonesian and
144
- Malaysian, as well as romanised Arabic, Chinese and Japanese). It should
145
- also support languages written with non-Latin alphabets (such as
146
- Russian), although no examples of these appear in the default training
147
- sets. Languages written in syllabaries or complex symbols which do not
148
- use white space to separate tokens are not compatible with AnyStyle's
149
- approach: this includes Chinese, Japanese, Arabic as well as many Indian
150
- languages.
153
+ AnyStyle works with references written in any Latin script,
154
+ including most European languages,
155
+ languages such as Indonesian and Malaysian,
156
+ as well as romanized Arabic, Chinese and Japanese.
157
+ It also supports non-Latin alphabets such as Cyrillic,
158
+ although no examples of these appear in the default training sets.
159
+ Languages written in syllabaries or complex symbols
160
+ which don't use white space to separate tokens
161
+ aren't compatible with AnyStyle's approach:
162
+ this includes Chinese, Japanese, Arabic, and Indian languages.
163
+
164
+ [cld]: https://github.com/jtoy/cld
165
+
151
166
 
152
167
  Dictionary Adapters
153
168
  -------------------
154
- During the statistical analysis of reference strings, AnyStyle relies
155
- on a large feature dictionary; by default, AnyStyle creates a persistent
156
- Ruby Hash in the folder of the `anystyle-data` Gem. This uses up about
157
- 2MB of disk space and keeps the entire dictionary in memory. If you prefer
158
- a smaller memory footprint, you can alternatively use AnyStyle's GDBM
159
- dictionary. GDBM bindings are part of the Ruby standard library and are
160
- supported on all platforms, but you may have to install GDBM on your
161
- platform before installing Ruby.
162
-
163
- If you do not want to use the the persistent Ruyb Hash nor the GBDM
164
- bindings, you can store your dictionary in memory (not recommended) or
165
- use a Redis. The best way to change the default dictionary adapter is by
166
- adjusting AnyStyle's default configuration (when using the default parser
167
- instances you must set the default before using the parser):
169
+ During the statistical analysis of reference strings,
170
+ AnyStyle relies on a large feature dictionary;
171
+ by default, AnyStyle creates a persistent Ruby hash
172
+ in the folder of the `anystyle-data` Gem.
173
+ This uses up about 2MB of disk space
174
+ and keeps the entire dictionary in memory.
175
+ If you prefer a smaller memory footprint,
176
+ you can use AnyStyle's GDBM dictionary.
177
+ GDBM bindings are part of the Ruby standard library
178
+ and supported on all platforms,
179
+ though you may need to install GDBM before installing Ruby.
180
+
181
+ If you don't want to use the persistent Ruby hash or GDBM,
182
+ you can store your dictionary in memory or use Redis.
183
+ The best way to change the default dictionary adapter
184
+ is by adjusting AnyStyle's default configuration
185
+ (when using the static parser instances
186
+ you must set the default before using the parser):
168
187
 
169
188
  AnyStyle::Dictionary.defaults[:adapter] = :ruby
170
189
  #-> Use a persistent Ruby hash;
@@ -186,34 +205,39 @@ and configure AnyStyle to use the Redis adapter:
186
205
  AnyStyle::Dictionary::Redis.defaults[:host] = 'localhost'
187
206
  AnyStyle::Dictionary::Redis.defaults[:port] = 6379
188
207
 
208
+
189
209
  About AnyStyle
190
210
  ==============
191
211
  Contributing
192
212
  ------------
193
- The AnyStyle source code is
194
- [hosted on GitHub](https://github.com/inukshuk/anystyle/).
213
+ The AnyStyle source code is hosted on [GitHub][].
195
214
  You can check out a copy of the latest code using Git:
196
215
 
197
216
  $ git clone https://github.com/inukshuk/anystyle.git
198
217
 
199
- If you've found a bug or have a question, please open an issue on the
200
- [AnyStyle issue tracker](https://github.com/inukshuk/anystyle/issues).
201
- Or, for extra credit, clone the AnyStyle repository, write a failing
202
- example, fix the bug and submit a pull request.
218
+ If you've found a bug or have a question,
219
+ please [report the issue][] or,
220
+ for extra credit, clone the AnyStyle repository,
221
+ write a failing example, fix the bug and submit a pull request.
222
+
223
+ [GitHub]: https://github.com/inukshuk/anystyle/
224
+ [report the issue]: https://github.com/inukshuk/anystyle/issues
225
+
203
226
 
204
227
  Credits
205
228
  -------
206
- AnyStyle is a volunteer effort and we encourage you
207
- to join us! Over the years our main contributors have been:
229
+ AnyStyle is a volunteer effort and you're encouraged to join!
230
+ Over the years the main contributors have been:
208
231
 
209
232
  * [Alex Fenton](https://github.com/a-fent)
210
233
  * [Sylvester Keil](https://github.com/inukshuk)
211
234
  * [Johannes Krtek](https://github.com/flachware)
212
235
  * [Ilja Srna](https://github.com/namyra)
213
236
 
237
+
214
238
  License
215
239
  -------
216
240
  Copyright 2011-2023 Sylvester Keil. All rights reserved.
217
241
 
218
242
  AnyStyle is distributed under a BSD-style license.
219
- See LICENSE for details.
243
+ See [LICENSE](./LICENSE) for details.
@@ -1,3 +1,4 @@
1
+ # coding: utf-8
1
2
  module AnyStyle
2
3
  class Feature
3
4
  class Keyword < Feature
@@ -12,7 +13,7 @@ module AnyStyle
12
13
  /^(compilador)$/i,
13
14
  /編/
14
15
  :editor
15
- when /著|撰/,
16
+ when /著|撰/
16
17
  :author
17
18
  when /^trans(l(ated|ators?|ation))?$/i,
18
19
  /^übers(etz(t|ung))?$/i,
@@ -0,0 +1,78 @@
1
+ module AnyStyle
2
+ module Format
3
+ module RIS
4
+ def format_ris(dataset, **opts)
5
+ format_hash(dataset).map { |entry| format_entry(entry) }.join("\n\n") + "\n"
6
+ end
7
+
8
+ def format_entry(entry)
9
+ lines = []
10
+
11
+ type = ris_type(entry[:type])
12
+ lines << "TY - #{type}"
13
+
14
+ add_authors(lines, entry[:author])
15
+ lines << "PY - #{unwrap(entry[:issued])}" if entry[:issued]
16
+ lines << "TI - #{unwrap(entry[:title])}" if entry[:title]
17
+ lines << "T2 - #{unwrap(entry[:'container-title'])}" if entry[:'container-title']
18
+ lines << "PB - #{unwrap(entry[:publisher])}" if entry[:publisher]
19
+ lines << "SN - #{unwrap(entry[:ISBN] || entry[:ISSN])}" if entry[:ISBN] || entry[:ISSN]
20
+ lines << "DO - #{unwrap(entry[:DOI])}" if entry[:DOI]
21
+ lines << "UR - #{unwrap(entry[:URL])}" if entry[:URL]
22
+ lines << "ET - #{unwrap(entry[:edition])}" if entry[:edition]
23
+ lines << "CY - #{unwrap(entry[:'publisher-place'] || entry[:location])}" if entry[:'publisher-place'] || entry[:location]
24
+ lines << "VL - #{unwrap(entry[:volume])}" if entry[:volume]
25
+ lines << "IS - #{unwrap(entry[:issue])}" if entry[:issue]
26
+ lines << "SP - #{unwrap(entry[:page].to_s.split('-')[0])}" if entry[:page]
27
+ lines << "EP - #{unwrap(entry[:page].to_s.split('-')[1])}" if entry[:page]&.include?("-")
28
+ lines << "ER -"
29
+
30
+ lines.join("\n")
31
+ end
32
+
33
+ # Extended RIS type mapping
34
+ def ris_type(type)
35
+ case type.to_s.downcase
36
+ when 'book' then 'BOOK' # Book
37
+ when 'chapter' then 'CHAP' # Book chapter
38
+ when 'article-journal' then 'JOUR' # Journal article
39
+ when 'magazine-article', 'magazine' then 'MGZN' # Magazine
40
+ when 'newspaper-article', 'news' then 'NEWS' # Newspaper
41
+ when 'conference-paper', 'proceedings-article' then 'CONF' # Conference
42
+ when 'manuscript' then 'UNPB' # Unpublished
43
+ when 'thesis' then 'THES' # Thesis/dissertation
44
+ when 'webpage', 'electronic', 'online' then 'ELEC' # Electronic source
45
+ when 'film' then 'MPCT' # Motion picture
46
+ when 'report' then 'RPRT' # Technical report
47
+ else 'GEN' # Generic fallback
48
+ end
49
+ end
50
+
51
+ def unwrap(val)
52
+ val.is_a?(Array) ? val.first : val
53
+ end
54
+
55
+ def add_authors(lines, authors)
56
+ return unless authors
57
+
58
+ authors.each do |author|
59
+ name = if author[:literal]
60
+ author[:literal]
61
+ elsif author[:family] || author[:given]
62
+ family = author[:family]
63
+ given = author[:given]&.gsub('.', '')
64
+
65
+ # Add space between adjacent uppercase initials (e.g., "HJ" => "H J")
66
+ given = given.gsub(/(?<=\A|\s)([A-Z])(?=[A-Z])/, '\1 ') if given
67
+
68
+ [family, given].compact.join(', ')
69
+ else
70
+ nil
71
+ end
72
+
73
+ lines << "AU - #{name}" if name
74
+ end
75
+ end
76
+ end
77
+ end
78
+ end
@@ -1,11 +1,13 @@
1
1
  module AnyStyle
2
- maybe_require 'language_detector'
2
+ maybe_require 'cld3'
3
3
  maybe_require 'unicode/scripts'
4
4
 
5
5
  class Normalizer
6
6
  class Locale < Normalizer
7
7
  def initialize
8
- @ld = LanguageDetector.new if defined?(LanguageDetector)
8
+ if defined?(::CLD3)
9
+ @ld = ::CLD3::NNetLanguageIdentifier.new(0, 1000)
10
+ end
9
11
  end
10
12
 
11
13
  def normalize(item, **opts)
@@ -24,14 +26,16 @@ module AnyStyle
24
26
  language = detect_language(sample)
25
27
  scripts = detect_scripts(sample)
26
28
 
27
- item[:language] ||= language unless language.nil?
29
+ item[:language] ||= language.to_s unless language.nil?
28
30
  item[:scripts] ||= scripts unless scripts.nil?
29
31
  item
30
32
  end
31
33
  end
32
34
 
33
35
  def detect_language(string)
34
- @ld.detect(string) unless @ld.nil?
36
+ if instance_variable_defined?('@ld') && string.length > 8
37
+ @ld.find_language(string).language
38
+ end
35
39
  end
36
40
 
37
41
  def detect_scripts(string)
@@ -96,8 +96,9 @@ module AnyStyle
96
96
  class Parser < ParserCore
97
97
  include Format::BibTeX
98
98
  include Format::CSL
99
+ include Format::RIS
99
100
 
100
- @formats = [:bibtex, :citeproc, :csl, :hash, :wapiti]
101
+ @formats = [:bibtex, :citeproc, :csl, :hash, :wapiti, :ris]
101
102
 
102
103
  @defaults = {
103
104
  model: File.join(SUPPORT, 'parser.mod'),
@@ -105,7 +106,7 @@ module AnyStyle
105
106
  compact: true,
106
107
  threads: 4,
107
108
  separator: /(?:\r?\n)+/,
108
- delimiter: /\s+|([\uFF01-\uFF64]|。|、)/,
109
+ delimiter: /(\s|\p{Space_Separator})+|([\uFF01-\uFF64]|。|、)/,
109
110
  format: :hash,
110
111
  training_data: File.join(RES, 'parser', 'core.xml')
111
112
  }
@@ -190,7 +191,7 @@ module AnyStyle
190
191
  case format.to_sym
191
192
  when :wapiti
192
193
  label(input, **opts)
193
- when :hash, :bibtex, :citeproc, :csl
194
+ when :hash, :bibtex, :citeproc, :csl, :ris
194
195
  formatter = "format_#{format}".to_sym
195
196
  send(formatter, label(input, **opts), **opts)
196
197
  else
@@ -1,3 +1,3 @@
1
1
  module AnyStyle
2
- VERSION = '1.4.2'.freeze
2
+ VERSION = '1.6.0'.freeze
3
3
  end
data/lib/anystyle.rb CHANGED
@@ -49,6 +49,7 @@ require 'anystyle/normalizer/volume'
49
49
 
50
50
  require 'anystyle/format/bibtex'
51
51
  require 'anystyle/format/csl'
52
+ require 'anystyle/format/ris'
52
53
 
53
54
  require 'anystyle/page'
54
55
  require 'anystyle/refs'
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.2
4
+ version: 1.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvester Keil
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2023-03-27 00:00:00.000000000 Z
10
+ date: 2025-05-11 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: bibtex-ruby
@@ -77,6 +76,7 @@ extra_rdoc_files:
77
76
  - LICENSE
78
77
  files:
79
78
  - ".gitattributes"
79
+ - ".github/workflows/ci.yml"
80
80
  - HISTORY.md
81
81
  - LICENSE
82
82
  - README.md
@@ -108,6 +108,7 @@ files:
108
108
  - lib/anystyle/finder.rb
109
109
  - lib/anystyle/format/bibtex.rb
110
110
  - lib/anystyle/format/csl.rb
111
+ - lib/anystyle/format/ris.rb
111
112
  - lib/anystyle/normalizer.rb
112
113
  - lib/anystyle/normalizer/arxiv.rb
113
114
  - lib/anystyle/normalizer/brackets.rb
@@ -161,7 +162,6 @@ homepage: http://anystyle.io
161
162
  licenses:
162
163
  - BSD-2-Clause
163
164
  metadata: {}
164
- post_install_message:
165
165
  rdoc_options:
166
166
  - "--line-numbers"
167
167
  - "--inline-source"
@@ -182,8 +182,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
182
  - !ruby/object:Gem::Version
183
183
  version: '0'
184
184
  requirements: []
185
- rubygems_version: 3.4.2
186
- signing_key:
185
+ rubygems_version: 3.6.6
187
186
  specification_version: 4
188
187
  summary: Smart and fast bibliography parser.
189
188
  test_files: []