anystyle 1.4.2 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +50 -0
- data/HISTORY.md +15 -11
- data/README.md +125 -101
- data/lib/anystyle/feature/keyword.rb +2 -1
- data/lib/anystyle/format/ris.rb +78 -0
- data/lib/anystyle/normalizer/locale.rb +8 -4
- data/lib/anystyle/parser.rb +4 -3
- data/lib/anystyle/version.rb +1 -1
- data/lib/anystyle.rb +1 -0
- metadata +5 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1ee4c3d7966aea4ee9ebe011d4a8bee37126caa2f0d1d44e3e238c66d15548f4
|
4
|
+
data.tar.gz: 6accc8256de87e852b4e8ef51485fce60fd566cf7614a1c1bd0b75c1e2ada9af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d56bfa4eb7b43302a2cd48558242ea9cd636efa0625a2217f06cbf318d788c6ef01f3b706a143fb15c0bc843ac6ea3cefba8da6046052dd1bef3967e72fb029
|
7
|
+
data.tar.gz: 4ab2159fada17aa37ab70c17060ab04d5e92cf3ec5a112045f1035828dba49272d6542395f533c44daaf31686ad00b3ede99b3059eb3b653ebf3d2b153a0ba52
|
@@ -0,0 +1,50 @@
|
|
1
|
+
name: CI
|
2
|
+
on:
|
3
|
+
push:
|
4
|
+
branches: [ main ]
|
5
|
+
pull_request:
|
6
|
+
branches: [ main ]
|
7
|
+
concurrency:
|
8
|
+
group: ci-${{ github.ref }}
|
9
|
+
cancel-in-progress: true
|
10
|
+
jobs:
|
11
|
+
ci:
|
12
|
+
name: ${{ matrix.ruby-version }} ${{ matrix.friendlyName }}-${{ matrix.arch }}
|
13
|
+
runs-on: ${{ matrix.os }}
|
14
|
+
|
15
|
+
strategy:
|
16
|
+
matrix:
|
17
|
+
ruby-version:
|
18
|
+
- "3.2"
|
19
|
+
- "3.3"
|
20
|
+
- "3.4"
|
21
|
+
os:
|
22
|
+
- ubuntu-latest
|
23
|
+
- macos-latest
|
24
|
+
- windows-latest
|
25
|
+
arch:
|
26
|
+
- x64
|
27
|
+
include:
|
28
|
+
- os: ubuntu-latest
|
29
|
+
friendlyName: Linux
|
30
|
+
- os: macos-latest
|
31
|
+
friendlyName: macOS
|
32
|
+
- os: windows-latest
|
33
|
+
friendlyName: Windows
|
34
|
+
|
35
|
+
steps:
|
36
|
+
- name: Checkout repository
|
37
|
+
uses: actions/checkout@v4
|
38
|
+
- name: Setup Ruby and install bundle
|
39
|
+
uses: ruby/setup-ruby@v1
|
40
|
+
with:
|
41
|
+
ruby-version: ${{ matrix.ruby-version }}
|
42
|
+
bundler-cache: true
|
43
|
+
- name: Compile and run test
|
44
|
+
run: bundle exec rake
|
45
|
+
- name: Upload coverage results
|
46
|
+
if: matrix.ruby-version == '3.3'
|
47
|
+
continue-on-error: true
|
48
|
+
uses: coverallsapp/github-action@v2
|
49
|
+
with:
|
50
|
+
github-token: ${{ github.token }}
|
data/HISTORY.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
1.6.0 / 2025-05-11
|
2
|
+
==================
|
3
|
+
* Added RIS output format (@ColorBlindHobbiest).
|
4
|
+
|
1
5
|
1.4.0 / 2023-01-06
|
2
6
|
==================
|
3
|
-
* Removed
|
7
|
+
* Removed deprecated string taint checking (@bbonamin).
|
4
8
|
* `AnyStyle::Parser#parse` will no longer automatically open local files.
|
5
9
|
Please call `Wapiti::Dataset.open` explicitly if you relied on this.
|
6
10
|
|
@@ -17,7 +21,7 @@
|
|
17
21
|
==================
|
18
22
|
* Updated and improved normalizers and CSL format.
|
19
23
|
* Improved Chinese reference tokenization.
|
20
|
-
* Added option to
|
24
|
+
* Added option to customize pdftotext path.
|
21
25
|
* Improved Finder reference line joining.
|
22
26
|
* Improved Finder model; training sets.
|
23
27
|
* Improved Parser model; training sets.
|
@@ -41,15 +45,15 @@
|
|
41
45
|
|
42
46
|
1.0.1 / 2018-06-06
|
43
47
|
==================
|
44
|
-
* Initial 1.0 release!
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
* Includes
|
50
|
-
* Based on updated `wapiti-ruby` which builds on Linux, macOS, and
|
51
|
-
|
52
|
-
* Flexible normalizer architecture (
|
48
|
+
* Initial 1.0 release!
|
49
|
+
This release isn't backwards compatible with the 0.x branch.
|
50
|
+
The new release uses the `AnyStyle` module via the `anystyle` Gem.
|
51
|
+
The old 0.x branch used the `Anystyle` module via the `anystyle-parser` Gem
|
52
|
+
but isn't maintained any longer.
|
53
|
+
* Includes improved parser model and training sets.
|
54
|
+
* Based on updated `wapiti-ruby` which builds on Linux, macOS, and Windows platforms
|
55
|
+
(thanks @a-fent and @WouterJeuris).
|
56
|
+
* Flexible normalizer architecture (you can skip individual normalizers).
|
53
57
|
* Improved feature architecture.
|
54
58
|
* Improved input/output via Wapiti::Dataset.
|
55
59
|
* New default dictionary adapter (thanks @a-fent).
|
data/README.md
CHANGED
@@ -1,24 +1,24 @@
|
|
1
1
|
AnyStyle
|
2
2
|
========
|
3
|
-
[](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml)
|
4
|
+
[](https://coveralls.io/github/inukshuk/anystyle)
|
5
5
|
|
6
|
-
AnyStyle is a
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
that
|
6
|
+
AnyStyle is a fast and smart parser of bibliographic references.
|
7
|
+
Originally inspired by [parsCit][] and [FreeCite][],
|
8
|
+
AnyStyle uses machine learning algorithms
|
9
|
+
and aims to make it easy to train models
|
10
|
+
with data that's relevant to you.
|
11
11
|
|
12
12
|
|
13
|
-
Using AnyStyle
|
14
|
-
|
15
|
-
|
13
|
+
Using AnyStyle on the command line
|
14
|
+
----------------------------------
|
16
15
|
$ [sudo] gem install anystyle-cli
|
17
16
|
$ anystyle --help
|
18
17
|
$ anystyle help find
|
19
18
|
$ anystyle help parse
|
20
19
|
|
21
|
-
See [anystyle-cli]
|
20
|
+
See [anystyle-cli][] for more details.
|
21
|
+
|
22
22
|
|
23
23
|
Using AnyStyle in Ruby
|
24
24
|
----------------------
|
@@ -26,8 +26,9 @@ Install the `anystyle` gem.
|
|
26
26
|
|
27
27
|
$ [sudo] gem install anystyle
|
28
28
|
|
29
|
-
|
30
|
-
by calling the `AnyStyle.parse` or `AnyStyle.find` methods.
|
29
|
+
Now you can use the static Parser and Finder instances
|
30
|
+
by calling the `AnyStyle.parse` or `AnyStyle.find` methods.
|
31
|
+
For example:
|
31
32
|
|
32
33
|
```ruby
|
33
34
|
require 'anystyle'
|
@@ -46,77 +47,86 @@ pp AnyStyle.parse 'Derrida, J. (1967). L’écriture et la différence (1 éd.).
|
|
46
47
|
#}]
|
47
48
|
```
|
48
49
|
|
49
|
-
|
50
|
-
`AnyStyle::Finder` with custom options.
|
50
|
+
You can also create your own
|
51
|
+
`AnyStyle::Parser` or `AnyStyle::Finder` with custom options.
|
51
52
|
|
52
53
|
|
53
|
-
Using
|
54
|
-
|
55
|
-
AnyStyle is available
|
54
|
+
Using AnyStyle on the web
|
55
|
+
-------------------------
|
56
|
+
AnyStyle is available at [anystyle.io][].
|
56
57
|
|
57
|
-
The web application
|
58
|
-
and you
|
58
|
+
The web application is [open source][]
|
59
|
+
and you're welcome to host your own instance!
|
59
60
|
|
60
|
-
|
61
|
-
|
61
|
+
[anystyle-cli]: https://github.com/inukshuk/anystyle-cli
|
62
|
+
[anystyle.io]: https://anystyle.io
|
63
|
+
[open source]: https://github.com/inukshuk/anystyle.io
|
64
|
+
[parsCit]: http://aye.comp.nus.edu.sg/parsCit/
|
65
|
+
[FreeCite]: http://freecite.library.brown.edu/
|
62
66
|
|
67
|
+
|
68
|
+
Improving results for your data
|
69
|
+
===============================
|
63
70
|
Training
|
64
71
|
--------
|
65
|
-
You can train custom Finder and Parser models.
|
66
|
-
to prepare your own data sets for training.
|
67
|
-
|
68
|
-
|
69
|
-
[core]
|
70
|
-
|
71
|
-
|
72
|
-
documents
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
to create your own model:
|
72
|
+
You can train custom Finder and Parser models.
|
73
|
+
To do this, you need to prepare your own data sets for training.
|
74
|
+
You can create your own data from scratch
|
75
|
+
or build on AnyStyle's default sets.
|
76
|
+
The default parser model uses the [core][] data set.
|
77
|
+
And though the finder model sources aren't available in their entirety,
|
78
|
+
due to copyright restrictions,
|
79
|
+
you can find several [tagged documents][] here.
|
80
|
+
|
81
|
+
When you have compiled a data set for training,
|
82
|
+
you will be ready to create your own model:
|
77
83
|
|
78
84
|
$ anystyle train training-data.xml custom.mod
|
79
85
|
|
80
|
-
This will save your new model as `custom.mod`.
|
81
|
-
instead of AnyStyle's default,
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
+
This will save your new model as `custom.mod`.
|
87
|
+
To use your model instead of AnyStyle's default,
|
88
|
+
use the `-P` or `--parser-model` flag and, respectively,
|
89
|
+
`-F` or `--finder-model` to use a custom finder model.
|
90
|
+
For instance, the command below
|
91
|
+
will parse a file `bib.txt` with the custom model
|
92
|
+
and print the result to STDOUT in JSON format:
|
86
93
|
|
87
94
|
$ anystyle -P custom.mod -f json parse bib.txt -
|
88
95
|
|
89
|
-
When training your own models, it
|
90
|
-
quality using a second data set.
|
91
|
-
|
92
|
-
|
93
|
-
custom model like this:
|
96
|
+
When training your own models, it's good practice
|
97
|
+
to check their quality using a second data set.
|
98
|
+
For example, to check your custom model
|
99
|
+
using AnyStyle's manually curated [gold][] data set:
|
94
100
|
|
95
101
|
$ anystyle -P x.mod check ./res/parser/gold.xml
|
96
102
|
Checking gold.xml................. 1 seq 0.06% 3 tok 0.01% 3s
|
97
103
|
|
98
|
-
This command
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
104
|
+
This command prints sequence and token error rates.
|
105
|
+
Here, sequence errors are the number of references
|
106
|
+
tagged differently by the parser
|
107
|
+
as compared to the curated input;
|
108
|
+
the number of token errors
|
109
|
+
is the total number of words tagged differently in these references.
|
110
|
+
In the example above, one reference was wrong
|
111
|
+
(out of 1,700 at the time),
|
112
|
+
because a total of three words had a different tag.
|
113
|
+
|
114
|
+
When working with training data,
|
115
|
+
it's a good idea to use the `Wapiti::Dataset` API in Ruby:
|
116
|
+
it supports standard set operators
|
117
|
+
and makes it easy to combine or compare data sets.
|
118
|
+
|
119
|
+
[core]: https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml
|
120
|
+
[gold]: https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml
|
121
|
+
[tagged documents]: https://github.com/inukshuk/anystyle/blob/master/res/finder
|
106
122
|
|
107
|
-
When working with training data, it is a good idea to use the
|
108
|
-
`Wapiti::Dataset` API in Ruby: it supports all the standard set
|
109
|
-
operators and makes it very easy to combine or compare data sets.
|
110
123
|
|
111
124
|
Natural Languages used in AnyStyle
|
112
125
|
----------------------------------
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
basis for the default AnyStyle parsing model. If the references you are
|
118
|
-
trying to parse include many non-English documents, the distribution of
|
119
|
-
natural languages in this corpus is relevant (detected using [cld](https://github.com/jtoy/cld)).
|
126
|
+
The [core][] data set contains the manually marked-up references
|
127
|
+
which comprise AnyStyle's default parser model.
|
128
|
+
If your references include non-English documents,
|
129
|
+
the distribution of natural languages in this corpus is relevant.
|
120
130
|
|
121
131
|
| Language | n |
|
122
132
|
|-------------------------|-----|
|
@@ -129,42 +139,51 @@ natural languages in this corpus is relevant (detected using [cld](https://githu
|
|
129
139
|
| Not reliably determined | 449 |
|
130
140
|
| (but mainly English) | |
|
131
141
|
|
132
|
-
(
|
142
|
+
(Measured using [cld][] and AnyStyle version 1.3.13)
|
133
143
|
|
134
144
|
There is a strong prevalence of English-language documents with the
|
135
|
-
conventions used in English-language bibliographies,
|
136
|
-
representation of other European languages.
|
137
|
-
those used in scientific publishing
|
138
|
-
|
139
|
-
|
145
|
+
conventions used in English-language bibliographies,
|
146
|
+
with some representation of other European languages.
|
147
|
+
The languages used reflect those used in scientific publishing
|
148
|
+
as well as the maintainers' competencies.
|
149
|
+
If you are working with documents in languages other than English,
|
150
|
+
you might consider training the model with some examples
|
140
151
|
in the relevant languages.
|
141
152
|
|
142
|
-
AnyStyle
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
153
|
+
AnyStyle works with references written in any Latin script,
|
154
|
+
including most European languages,
|
155
|
+
languages such as Indonesian and Malaysian,
|
156
|
+
as well as romanized Arabic, Chinese and Japanese.
|
157
|
+
It also supports non-Latin alphabets such as Cyrillic,
|
158
|
+
although no examples of these appear in the default training sets.
|
159
|
+
Languages written in syllabaries or complex symbols
|
160
|
+
which don't use white space to separate tokens
|
161
|
+
aren't compatible with AnyStyle's approach:
|
162
|
+
this includes Chinese, Japanese, Arabic, and Indian languages.
|
163
|
+
|
164
|
+
[cld]: https://github.com/jtoy/cld
|
165
|
+
|
151
166
|
|
152
167
|
Dictionary Adapters
|
153
168
|
-------------------
|
154
|
-
During the statistical analysis of reference strings,
|
155
|
-
on a large feature dictionary;
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
169
|
+
During the statistical analysis of reference strings,
|
170
|
+
AnyStyle relies on a large feature dictionary;
|
171
|
+
by default, AnyStyle creates a persistent Ruby hash
|
172
|
+
in the folder of the `anystyle-data` Gem.
|
173
|
+
This uses up about 2MB of disk space
|
174
|
+
and keeps the entire dictionary in memory.
|
175
|
+
If you prefer a smaller memory footprint,
|
176
|
+
you can use AnyStyle's GDBM dictionary.
|
177
|
+
GDBM bindings are part of the Ruby standard library
|
178
|
+
and supported on all platforms,
|
179
|
+
though you may need to install GDBM before installing Ruby.
|
180
|
+
|
181
|
+
If you don't want to use the persistent Ruby hash or GDBM,
|
182
|
+
you can store your dictionary in memory or use Redis.
|
183
|
+
The best way to change the default dictionary adapter
|
184
|
+
is by adjusting AnyStyle's default configuration
|
185
|
+
(when using the static parser instances
|
186
|
+
you must set the default before using the parser):
|
168
187
|
|
169
188
|
AnyStyle::Dictionary.defaults[:adapter] = :ruby
|
170
189
|
#-> Use a persistent Ruby hash;
|
@@ -186,34 +205,39 @@ and configure AnyStyle to use the Redis adapter:
|
|
186
205
|
AnyStyle::Dictionary::Redis.defaults[:host] = 'localhost'
|
187
206
|
AnyStyle::Dictionary::Redis.defaults[:port] = 6379
|
188
207
|
|
208
|
+
|
189
209
|
About AnyStyle
|
190
210
|
==============
|
191
211
|
Contributing
|
192
212
|
------------
|
193
|
-
The AnyStyle source code is
|
194
|
-
[hosted on GitHub](https://github.com/inukshuk/anystyle/).
|
213
|
+
The AnyStyle source code is hosted on [GitHub][].
|
195
214
|
You can check out a copy of the latest code using Git:
|
196
215
|
|
197
216
|
$ git clone https://github.com/inukshuk/anystyle.git
|
198
217
|
|
199
|
-
If you've found a bug or have a question,
|
200
|
-
[
|
201
|
-
|
202
|
-
example, fix the bug and submit a pull request.
|
218
|
+
If you've found a bug or have a question,
|
219
|
+
please [report the issue][] or,
|
220
|
+
for extra credit, clone the AnyStyle repository,
|
221
|
+
write a failing example, fix the bug and submit a pull request.
|
222
|
+
|
223
|
+
[GitHub]: https://github.com/inukshuk/anystyle/
|
224
|
+
[report the issue]: https://github.com/inukshuk/anystyle/issues
|
225
|
+
|
203
226
|
|
204
227
|
Credits
|
205
228
|
-------
|
206
|
-
AnyStyle is a volunteer effort and
|
207
|
-
|
229
|
+
AnyStyle is a volunteer effort and you're encouraged to join!
|
230
|
+
Over the years the main contributors have been:
|
208
231
|
|
209
232
|
* [Alex Fenton](https://github.com/a-fent)
|
210
233
|
* [Sylvester Keil](https://github.com/inukshuk)
|
211
234
|
* [Johannes Krtek](https://github.com/flachware)
|
212
235
|
* [Ilja Srna](https://github.com/namyra)
|
213
236
|
|
237
|
+
|
214
238
|
License
|
215
239
|
-------
|
216
240
|
Copyright 2011-2023 Sylvester Keil. All rights reserved.
|
217
241
|
|
218
242
|
AnyStyle is distributed under a BSD-style license.
|
219
|
-
See LICENSE for details.
|
243
|
+
See [LICENSE](./LICENSE) for details.
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
module AnyStyle
|
2
3
|
class Feature
|
3
4
|
class Keyword < Feature
|
@@ -12,7 +13,7 @@ module AnyStyle
|
|
12
13
|
/^(compilador)$/i,
|
13
14
|
/編/
|
14
15
|
:editor
|
15
|
-
when
|
16
|
+
when /著|撰/
|
16
17
|
:author
|
17
18
|
when /^trans(l(ated|ators?|ation))?$/i,
|
18
19
|
/^übers(etz(t|ung))?$/i,
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
module Format
|
3
|
+
module RIS
|
4
|
+
def format_ris(dataset, **opts)
|
5
|
+
format_hash(dataset).map { |entry| format_entry(entry) }.join("\n\n") + "\n"
|
6
|
+
end
|
7
|
+
|
8
|
+
def format_entry(entry)
|
9
|
+
lines = []
|
10
|
+
|
11
|
+
type = ris_type(entry[:type])
|
12
|
+
lines << "TY - #{type}"
|
13
|
+
|
14
|
+
add_authors(lines, entry[:author])
|
15
|
+
lines << "PY - #{unwrap(entry[:issued])}" if entry[:issued]
|
16
|
+
lines << "TI - #{unwrap(entry[:title])}" if entry[:title]
|
17
|
+
lines << "T2 - #{unwrap(entry[:'container-title'])}" if entry[:'container-title']
|
18
|
+
lines << "PB - #{unwrap(entry[:publisher])}" if entry[:publisher]
|
19
|
+
lines << "SN - #{unwrap(entry[:ISBN] || entry[:ISSN])}" if entry[:ISBN] || entry[:ISSN]
|
20
|
+
lines << "DO - #{unwrap(entry[:DOI])}" if entry[:DOI]
|
21
|
+
lines << "UR - #{unwrap(entry[:URL])}" if entry[:URL]
|
22
|
+
lines << "ET - #{unwrap(entry[:edition])}" if entry[:edition]
|
23
|
+
lines << "CY - #{unwrap(entry[:'publisher-place'] || entry[:location])}" if entry[:'publisher-place'] || entry[:location]
|
24
|
+
lines << "VL - #{unwrap(entry[:volume])}" if entry[:volume]
|
25
|
+
lines << "IS - #{unwrap(entry[:issue])}" if entry[:issue]
|
26
|
+
lines << "SP - #{unwrap(entry[:page].to_s.split('-')[0])}" if entry[:page]
|
27
|
+
lines << "EP - #{unwrap(entry[:page].to_s.split('-')[1])}" if entry[:page]&.include?("-")
|
28
|
+
lines << "ER -"
|
29
|
+
|
30
|
+
lines.join("\n")
|
31
|
+
end
|
32
|
+
|
33
|
+
# Extended RIS type mapping
|
34
|
+
def ris_type(type)
|
35
|
+
case type.to_s.downcase
|
36
|
+
when 'book' then 'BOOK' # Book
|
37
|
+
when 'chapter' then 'CHAP' # Book chapter
|
38
|
+
when 'article-journal' then 'JOUR' # Journal article
|
39
|
+
when 'magazine-article', 'magazine' then 'MGZN' # Magazine
|
40
|
+
when 'newspaper-article', 'news' then 'NEWS' # Newspaper
|
41
|
+
when 'conference-paper', 'proceedings-article' then 'CONF' # Conference
|
42
|
+
when 'manuscript' then 'UNPB' # Unpublished
|
43
|
+
when 'thesis' then 'THES' # Thesis/dissertation
|
44
|
+
when 'webpage', 'electronic', 'online' then 'ELEC' # Electronic source
|
45
|
+
when 'film' then 'MPCT' # Motion picture
|
46
|
+
when 'report' then 'RPRT' # Technical report
|
47
|
+
else 'GEN' # Generic fallback
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def unwrap(val)
|
52
|
+
val.is_a?(Array) ? val.first : val
|
53
|
+
end
|
54
|
+
|
55
|
+
def add_authors(lines, authors)
|
56
|
+
return unless authors
|
57
|
+
|
58
|
+
authors.each do |author|
|
59
|
+
name = if author[:literal]
|
60
|
+
author[:literal]
|
61
|
+
elsif author[:family] || author[:given]
|
62
|
+
family = author[:family]
|
63
|
+
given = author[:given]&.gsub('.', '')
|
64
|
+
|
65
|
+
# Add space between adjacent uppercase initials (e.g., "HJ" => "H J")
|
66
|
+
given = given.gsub(/(?<=\A|\s)([A-Z])(?=[A-Z])/, '\1 ') if given
|
67
|
+
|
68
|
+
[family, given].compact.join(', ')
|
69
|
+
else
|
70
|
+
nil
|
71
|
+
end
|
72
|
+
|
73
|
+
lines << "AU - #{name}" if name
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -1,11 +1,13 @@
|
|
1
1
|
module AnyStyle
|
2
|
-
maybe_require '
|
2
|
+
maybe_require 'cld3'
|
3
3
|
maybe_require 'unicode/scripts'
|
4
4
|
|
5
5
|
class Normalizer
|
6
6
|
class Locale < Normalizer
|
7
7
|
def initialize
|
8
|
-
|
8
|
+
if defined?(::CLD3)
|
9
|
+
@ld = ::CLD3::NNetLanguageIdentifier.new(0, 1000)
|
10
|
+
end
|
9
11
|
end
|
10
12
|
|
11
13
|
def normalize(item, **opts)
|
@@ -24,14 +26,16 @@ module AnyStyle
|
|
24
26
|
language = detect_language(sample)
|
25
27
|
scripts = detect_scripts(sample)
|
26
28
|
|
27
|
-
item[:language] ||= language unless language.nil?
|
29
|
+
item[:language] ||= language.to_s unless language.nil?
|
28
30
|
item[:scripts] ||= scripts unless scripts.nil?
|
29
31
|
item
|
30
32
|
end
|
31
33
|
end
|
32
34
|
|
33
35
|
def detect_language(string)
|
34
|
-
@ld
|
36
|
+
if instance_variable_defined?('@ld') && string.length > 8
|
37
|
+
@ld.find_language(string).language
|
38
|
+
end
|
35
39
|
end
|
36
40
|
|
37
41
|
def detect_scripts(string)
|
data/lib/anystyle/parser.rb
CHANGED
@@ -96,8 +96,9 @@ module AnyStyle
|
|
96
96
|
class Parser < ParserCore
|
97
97
|
include Format::BibTeX
|
98
98
|
include Format::CSL
|
99
|
+
include Format::RIS
|
99
100
|
|
100
|
-
@formats = [:bibtex, :citeproc, :csl, :hash, :wapiti]
|
101
|
+
@formats = [:bibtex, :citeproc, :csl, :hash, :wapiti, :ris]
|
101
102
|
|
102
103
|
@defaults = {
|
103
104
|
model: File.join(SUPPORT, 'parser.mod'),
|
@@ -105,7 +106,7 @@ module AnyStyle
|
|
105
106
|
compact: true,
|
106
107
|
threads: 4,
|
107
108
|
separator: /(?:\r?\n)+/,
|
108
|
-
delimiter:
|
109
|
+
delimiter: /(\s|\p{Space_Separator})+|([\uFF01-\uFF64]|。|、)/,
|
109
110
|
format: :hash,
|
110
111
|
training_data: File.join(RES, 'parser', 'core.xml')
|
111
112
|
}
|
@@ -190,7 +191,7 @@ module AnyStyle
|
|
190
191
|
case format.to_sym
|
191
192
|
when :wapiti
|
192
193
|
label(input, **opts)
|
193
|
-
when :hash, :bibtex, :citeproc, :csl
|
194
|
+
when :hash, :bibtex, :citeproc, :csl, :ris
|
194
195
|
formatter = "format_#{format}".to_sym
|
195
196
|
send(formatter, label(input, **opts), **opts)
|
196
197
|
else
|
data/lib/anystyle/version.rb
CHANGED
data/lib/anystyle.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-05-11 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: bibtex-ruby
|
@@ -77,6 +76,7 @@ extra_rdoc_files:
|
|
77
76
|
- LICENSE
|
78
77
|
files:
|
79
78
|
- ".gitattributes"
|
79
|
+
- ".github/workflows/ci.yml"
|
80
80
|
- HISTORY.md
|
81
81
|
- LICENSE
|
82
82
|
- README.md
|
@@ -108,6 +108,7 @@ files:
|
|
108
108
|
- lib/anystyle/finder.rb
|
109
109
|
- lib/anystyle/format/bibtex.rb
|
110
110
|
- lib/anystyle/format/csl.rb
|
111
|
+
- lib/anystyle/format/ris.rb
|
111
112
|
- lib/anystyle/normalizer.rb
|
112
113
|
- lib/anystyle/normalizer/arxiv.rb
|
113
114
|
- lib/anystyle/normalizer/brackets.rb
|
@@ -161,7 +162,6 @@ homepage: http://anystyle.io
|
|
161
162
|
licenses:
|
162
163
|
- BSD-2-Clause
|
163
164
|
metadata: {}
|
164
|
-
post_install_message:
|
165
165
|
rdoc_options:
|
166
166
|
- "--line-numbers"
|
167
167
|
- "--inline-source"
|
@@ -182,8 +182,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
182
182
|
- !ruby/object:Gem::Version
|
183
183
|
version: '0'
|
184
184
|
requirements: []
|
185
|
-
rubygems_version: 3.
|
186
|
-
signing_key:
|
185
|
+
rubygems_version: 3.6.6
|
187
186
|
specification_version: 4
|
188
187
|
summary: Smart and fast bibliography parser.
|
189
188
|
test_files: []
|