anystyle 1.4.2 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5dd18e66e792426e7c6101a17785637f70b63b5384afe7db81406c1cebfe8708
4
- data.tar.gz: 1aada4ba80830fee3350e58f2eca4d9efddef8956864181b96983e54f41dcb9c
3
+ metadata.gz: 68c4f52efb0ab5b4c9ba610afeb4c63d3c6db23fd5bbaa8be3e7d587f8051d6e
4
+ data.tar.gz: 713f8279d23f4f9338c34073a750767ec061e92e47511ee7569694fb462fa6b2
5
5
  SHA512:
6
- metadata.gz: 3b0bf084efbd329498bbfecae8ee2c24be0ab3335f7540c7c2d59ab79fc3d163c099cc8e27683e5fd31f7a21421ee09e44eedd835dd0688c40f8f477ca457c3a
7
- data.tar.gz: e17bb0d4745aa5947d100e81959b95bc515f697fa55f3f6500386a2231deda34858e161f63b8ce94fd53b682578997648a6db598c6396e8da5e0d3eff1d3e6c3
6
+ metadata.gz: a96358d0ca9731516e0ade60bd1089aa8d0547c97b88e89049b0d031b7847d66785e2a448b6966fda0bb1965190978098671f611cc79323c06a33f50562c9b7e
7
+ data.tar.gz: cf536f64d27601009c1f9270c7b13c5a09b683b20acd4315042d782418afa100699de5589a4c64f06bf499c631ff4396fc1854094087361f928c157c06d1e745
@@ -0,0 +1,52 @@
1
+ name: CI
2
+ on:
3
+ push:
4
+ branches: [ main ]
5
+ pull_request:
6
+ branches: [ main ]
7
+ concurrency:
8
+ group: ci-${{ github.ref }}
9
+ cancel-in-progress: true
10
+ jobs:
11
+ ci:
12
+ name: ${{ matrix.ruby-version }} ${{ matrix.friendlyName }}-${{ matrix.arch }}
13
+ runs-on: ${{ matrix.os }}
14
+
15
+ strategy:
16
+ matrix:
17
+ ruby-version:
18
+ - "2.7"
19
+ - "3.0"
20
+ - "3.1"
21
+ - "3.2"
22
+ - "3.3"
23
+ os:
24
+ - ubuntu-latest
25
+ - macos-latest
26
+ - windows-latest
27
+ arch:
28
+ - x64
29
+ include:
30
+ - os: ubuntu-latest
31
+ friendlyName: Linux
32
+ - os: macos-latest
33
+ friendlyName: macOS
34
+ - os: windows-latest
35
+ friendlyName: Windows
36
+
37
+ steps:
38
+ - name: Checkout repository
39
+ uses: actions/checkout@v4
40
+ - name: Setup Ruby and install bundle
41
+ uses: ruby/setup-ruby@v1
42
+ with:
43
+ ruby-version: ${{ matrix.ruby-version }}
44
+ bundler-cache: true
45
+ - name: Compile and run test
46
+ run: bundle exec rake
47
+ - name: Upload coverage results
48
+ if: matrix.ruby-version == '3.2'
49
+ continue-on-error: true
50
+ uses: coverallsapp/github-action@v2
51
+ with:
52
+ github-token: ${{ github.token }}
data/HISTORY.md CHANGED
@@ -1,6 +1,6 @@
1
1
  1.4.0 / 2023-01-06
2
2
  ==================
3
- * Removed deprectate string taint checking (@bbonamin).
3
+ * Removed deprecated string taint checking (@bbonamin).
4
4
  * `AnyStyle::Parser#parse` will no longer automatically open local files.
5
5
  Please call `Wapiti::Dataset.open` explicitly if you relied on this.
6
6
 
@@ -17,7 +17,7 @@
17
17
  ==================
18
18
  * Updated and improved normalizers and CSL format.
19
19
  * Improved Chinese reference tokenization.
20
- * Added option to customizee pdftotext path.
20
+ * Added option to customize pdftotext path.
21
21
  * Improved Finder reference line joining.
22
22
  * Improved Finder model; training sets.
23
23
  * Improved Parser model; training sets.
@@ -41,15 +41,15 @@
41
41
 
42
42
  1.0.1 / 2018-06-06
43
43
  ==================
44
- * Initial 1.0 release! This release is not backwards compatible to the
45
- 0.x branch. The new release uses the `AnyStyle` module and can be
46
- installed using the `anystyle` Gem. The 0.x branch used the `Anystyle`
47
- module and can still be installed using the `anystyle-parser` Gem but
48
- will not be maintained any longer.
49
- * Includes vastly improved parser model and training sets.
50
- * Based on updated `wapiti-ruby` which builds on Linux, macOS, and
51
- Windows platforms (thanks @a-fent and @WouterJeuris).
52
- * Flexible normalizer architecture (normalizers can be skipped individually).
44
+ * Initial 1.0 release!
45
+ This release isn't backwards compatible with the 0.x branch.
46
+ The new release uses the `AnyStyle` module via the `anystyle` Gem.
47
+ The old 0.x branch used the `Anystyle` module via the `anystyle-parser` Gem
48
+ but isn't maintained any longer.
49
+ * Includes improved parser model and training sets.
50
+ * Based on updated `wapiti-ruby` which builds on Linux, macOS, and Windows platforms
51
+ (thanks @a-fent and @WouterJeuris).
52
+ * Flexible normalizer architecture (you can skip individual normalizers).
53
53
  * Improved feature architecture.
54
54
  * Improved input/output via Wapiti::Dataset.
55
55
  * New default dictionary adapter (thanks @a-fent).
data/README.md CHANGED
@@ -1,24 +1,24 @@
1
1
  AnyStyle
2
2
  ========
3
- [![Build Status](https://travis-ci.org/inukshuk/anystyle.svg?branch=master)](https://travis-ci.org/inukshuk/anystyle)
4
- [![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/anystyle?branch=master)
3
+ [![CI](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml/badge.svg)](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml)
4
+ [![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg)](https://coveralls.io/github/inukshuk/anystyle)
5
5
 
6
- AnyStyle is a very fast and smart parser for academic references. It
7
- was originally inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/)
8
- and [FreeCite](http://freecite.library.brown.edu/); AnyStyle uses machine
9
- learning algorithms and aims to make it easy to train the model with data
10
- that is relevant to your parsing needs.
6
+ AnyStyle is a fast and smart parser of bibliographic references.
7
+ Originally inspired by [parsCit][] and [FreeCite][],
8
+ AnyStyle uses machine learning algorithms
9
+ and aims to make it easy to train models
10
+ with data that's relevant to you.
11
11
 
12
12
 
13
- Using AnyStyle CLI
14
- ------------------
15
-
13
+ Using AnyStyle on the command line
14
+ ----------------------------------
16
15
  $ [sudo] gem install anystyle-cli
17
16
  $ anystyle --help
18
17
  $ anystyle help find
19
18
  $ anystyle help parse
20
19
 
21
- See [anystyle-cli](https://github.com/inukshuk/anystyle-cli) for more details.
20
+ See [anystyle-cli][] for more details.
21
+
22
22
 
23
23
  Using AnyStyle in Ruby
24
24
  ----------------------
@@ -26,8 +26,9 @@ Install the `anystyle` gem.
26
26
 
27
27
  $ [sudo] gem install anystyle
28
28
 
29
- Once installed, you can use the static Parser and Finder instances
30
- by calling the `AnyStyle.parse` or `AnyStyle.find` methods. For example:
29
+ Now you can use the static Parser and Finder instances
30
+ by calling the `AnyStyle.parse` or `AnyStyle.find` methods.
31
+ For example:
31
32
 
32
33
  ```ruby
33
34
  require 'anystyle'
@@ -46,77 +47,86 @@ pp AnyStyle.parse 'Derrida, J. (1967). L’écriture et la différence (1 éd.).
46
47
  #}]
47
48
  ```
48
49
 
49
- Alternatively, you can create your own `AnyStyle::Parser` or
50
- `AnyStyle::Finder` with custom options.
50
+ You can also create your own
51
+ `AnyStyle::Parser` or `AnyStyle::Finder` with custom options.
51
52
 
52
53
 
53
- Using the AnyStyle Web App
54
- --------------------------
55
- AnyStyle is available as web application at [anystyle.io](https://anystyle.io).
54
+ Using AnyStyle on the web
55
+ -------------------------
56
+ AnyStyle is available at [anystyle.io][].
56
57
 
57
- The web application [is open source](https://github.com/inukshuk/anystyle.io)
58
- and you can also host yourself!
58
+ The web application is [open source][]
59
+ and you're welcome to host your own instance!
59
60
 
60
- Improving results for your data
61
- =================================
61
+ [anystyle-cli]: https://github.com/inukshuk/anystyle-cli
62
+ [anystyle.io]: https://anystyle.io
63
+ [open source]: https://github.com/inukshuk/anystyle.io
64
+ [parsCit]: http://aye.comp.nus.edu.sg/parsCit/
65
+ [FreeCite]: http://freecite.library.brown.edu/
62
66
 
67
+
68
+ Improving results for your data
69
+ ===============================
63
70
  Training
64
71
  --------
65
- You can train custom Finder and Parser models. To do this, you need
66
- to prepare your own data sets for training. You can create your own
67
- data from scratch or build on AnyStyle's default sets. The default
68
- parser model is based on the
69
- [core](https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml)
70
- data set; the default finder model source data is not publicly
71
- available in its entirety, but you can find a number of tagged
72
- documents
73
- [here](https://github.com/inukshuk/anystyle/blob/master/res/finder).
74
-
75
- When you have compiled a data set for training, you will be ready
76
- to create your own model:
72
+ You can train custom Finder and Parser models.
73
+ To do this, you need to prepare your own data sets for training.
74
+ You can create your own data from scratch
75
+ or build on AnyStyle's default sets.
76
+ The default parser model uses the [core][] data set.
77
+ And though the finder model sources aren't available in their entirety,
78
+ due to copyright restrictions,
79
+ you can find several [tagged documents][] here.
80
+
81
+ When you have compiled a data set for training,
82
+ you will be ready to create your own model:
77
83
 
78
84
  $ anystyle train training-data.xml custom.mod
79
85
 
80
- This will save your new model as `custom.mod`. To use your model
81
- instead of AnyStyle's default, use the `-P` or `--parser-model` flag
82
- and, respectively, `-F` or `--finder-model` to use a custom Finder
83
- model. For instance, the command below would parse all references
84
- in `bib.txt` using the custom model we just trained and print the
85
- result to STDOUT using the JSON output format:
86
+ This will save your new model as `custom.mod`.
87
+ To use your model instead of AnyStyle's default,
88
+ use the `-P` or `--parser-model` flag and, respectively,
89
+ `-F` or `--finder-model` to use a custom finder model.
90
+ For instance, the command below
91
+ will parse a file `bib.txt` with the custom model
92
+ and print the result to STDOUT in JSON format:
86
93
 
87
94
  $ anystyle -P custom.mod -f json parse bib.txt -
88
95
 
89
- When training your own models, it is good practice to check the
90
- quality using a second data set. For example, using AnyStyle's own
91
- [gold](https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml)
92
- data set (a large, manually curated data set) we could check our
93
- custom model like this:
96
+ When training your own models, it's good practice
97
+ to check their quality using a second data set.
98
+ For example, to check your custom model
99
+ using AnyStyle's manually curated [gold][] data set:
94
100
 
95
101
  $ anystyle -P x.mod check ./res/parser/gold.xml
96
102
  Checking gold.xml................. 1 seq 0.06% 3 tok 0.01% 3s
97
103
 
98
- This command will print the sequence and token error rates; in
99
- the case of AnyStyle a the number of sequence errors is the number
100
- of references which were tagged differently by the parser than they
101
- were in the input; the number of token errors is the total number of
102
- words across all the references which were tagged differently. In the
103
- example above, we got one reference wrong (out of 1700 at the time);
104
- but even this one reference was mostly tagged correctly, because only
105
- a total of 3 words were tagged differently.
104
+ This command prints sequence and token error rates.
105
+ Here, sequence errors are the number of references
106
+ tagged differently by the parser
107
+ as compared to the curated input;
108
+ the number of token errors
109
+ is the total number of words tagged differently across these references.
110
+ In the example above, one reference was wrong
111
+ (out of 1,700 at the time),
112
+ because a total of three words had a different tag.
113
+
114
+ When working with training data,
115
+ it's a good idea to use the `Wapiti::Dataset` API in Ruby:
116
+ it supports standard set operators
117
+ and makes it easy to combine or compare data sets.
118
+
119
+ [core]: https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml
120
+ [gold]: https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml
121
+ [tagged documents]: https://github.com/inukshuk/anystyle/blob/master/res/finder
106
122
 
107
- When working with training data, it is a good idea to use the
108
- `Wapiti::Dataset` API in Ruby: it supports all the standard set
109
- operators and makes it very easy to combine or compare data sets.
110
123
 
111
124
  Natural Languages used in AnyStyle
112
125
  ----------------------------------
113
-
114
- As mentioned above, the
115
- [core](https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml)
116
- dataset contains the manually marked-up references that are used as the
117
- basis for the default AnyStyle parsing model. If the references you are
118
- trying to parse include many non-English documents, the distribution of
119
- natural languages in this corpus is relevant (detected using [cld](https://github.com/jtoy/cld)).
126
+ The [core][] data set contains the manually marked-up references
127
+ which comprise AnyStyle's default parser model.
128
+ If your references include non-English documents,
129
+ the distribution of natural languages in this corpus is relevant.
120
130
 
121
131
  | Language | n |
122
132
  |-------------------------|-----|
@@ -129,42 +139,51 @@ natural languages in this corpus is relevant (detected using [cld](https://githu
129
139
  | Not reliably determined | 449 |
130
140
  | (but mainly English) | |
131
141
 
132
- (These data are based on AnyStyle version 1.3.13)
142
+ (Measured using [cld][] and AnyStyle version 1.3.13)
133
143
 
134
144
  There is a strong prevalence of English-language documents with the
135
- conventions used in English-language bibliographies, with some
136
- representation of other European languages. The languages used reflect
137
- those used in scientific publishing as well as the maintainers'
138
- competencies. If you are working with many documents in languages other
139
- than English, you might consider training the model with some examples
145
+ conventions used in English-language bibliographies,
146
+ with some representation of other European languages.
147
+ The languages used reflect those used in scientific publishing
148
+ as well as the maintainers' competencies.
149
+ If you are working with documents in languages other than English,
150
+ you might consider training the model with some examples
140
151
  in the relevant languages.
141
152
 
142
- AnyStyle should work with references written in any Latin script
143
- (including most European languages, languages such as Indonesian and
144
- Malaysian, as well as romanised Arabic, Chinese and Japanese). It should
145
- also support languages written with non-Latin alphabets (such as
146
- Russian), although no examples of these appear in the default training
147
- sets. Languages written in syllabaries or complex symbols which do not
148
- use white space to separate tokens are not compatible with AnyStyle's
149
- approach: this includes Chinese, Japanese, Arabic as well as many Indian
150
- languages.
153
+ AnyStyle works with references written in any Latin script,
154
+ including most European languages,
155
+ languages such as Indonesian and Malaysian,
156
+ as well as romanized Arabic, Chinese and Japanese.
157
+ It also supports non-Latin alphabets such as Cyrillic,
158
+ although no examples of these appear in the default training sets.
159
+ Languages written in syllabaries or complex symbols
160
+ which don't use white space to separate tokens
161
+ aren't compatible with AnyStyle's approach:
162
+ this includes Chinese, Japanese, Arabic, and Indian languages.
163
+
164
+ [cld]: https://github.com/jtoy/cld
165
+
151
166
 
152
167
  Dictionary Adapters
153
168
  -------------------
154
- During the statistical analysis of reference strings, AnyStyle relies
155
- on a large feature dictionary; by default, AnyStyle creates a persistent
156
- Ruby Hash in the folder of the `anystyle-data` Gem. This uses up about
157
- 2MB of disk space and keeps the entire dictionary in memory. If you prefer
158
- a smaller memory footprint, you can alternatively use AnyStyle's GDBM
159
- dictionary. GDBM bindings are part of the Ruby standard library and are
160
- supported on all platforms, but you may have to install GDBM on your
161
- platform before installing Ruby.
162
-
163
- If you do not want to use the the persistent Ruyb Hash nor the GBDM
164
- bindings, you can store your dictionary in memory (not recommended) or
165
- use a Redis. The best way to change the default dictionary adapter is by
166
- adjusting AnyStyle's default configuration (when using the default parser
167
- instances you must set the default before using the parser):
169
+ During the statistical analysis of reference strings,
170
+ AnyStyle relies on a large feature dictionary;
171
+ by default, AnyStyle creates a persistent Ruby hash
172
+ in the folder of the `anystyle-data` Gem.
173
+ This uses up about 2MB of disk space
174
+ and keeps the entire dictionary in memory.
175
+ If you prefer a smaller memory footprint,
176
+ you can use AnyStyle's GDBM dictionary.
177
+ GDBM bindings are part of the Ruby standard library
178
+ and supported on all platforms,
179
+ though you may need to install GDBM before installing Ruby.
180
+
181
+ If you don't want to use the persistent Ruby hash or GDBM,
182
+ you can store your dictionary in memory or use Redis.
183
+ The best way to change the default dictionary adapter
184
+ is by adjusting AnyStyle's default configuration
185
+ (when using the static parser instances
186
+ you must set the default before using the parser):
168
187
 
169
188
  AnyStyle::Dictionary.defaults[:adapter] = :ruby
170
189
  #-> Use a persistent Ruby hash;
@@ -186,34 +205,39 @@ and configure AnyStyle to use the Redis adapter:
186
205
  AnyStyle::Dictionary::Redis.defaults[:host] = 'localhost'
187
206
  AnyStyle::Dictionary::Redis.defaults[:port] = 6379
188
207
 
208
+
189
209
  About AnyStyle
190
210
  ==============
191
211
  Contributing
192
212
  ------------
193
- The AnyStyle source code is
194
- [hosted on GitHub](https://github.com/inukshuk/anystyle/).
213
+ The AnyStyle source code is hosted on [GitHub][].
195
214
  You can check out a copy of the latest code using Git:
196
215
 
197
216
  $ git clone https://github.com/inukshuk/anystyle.git
198
217
 
199
- If you've found a bug or have a question, please open an issue on the
200
- [AnyStyle issue tracker](https://github.com/inukshuk/anystyle/issues).
201
- Or, for extra credit, clone the AnyStyle repository, write a failing
202
- example, fix the bug and submit a pull request.
218
+ If you've found a bug or have a question,
219
+ please [report the issue][] or,
220
+ for extra credit, clone the AnyStyle repository,
221
+ write a failing example, fix the bug and submit a pull request.
222
+
223
+ [GitHub]: https://github.com/inukshuk/anystyle/
224
+ [report the issue]: https://github.com/inukshuk/anystyle/issues
225
+
203
226
 
204
227
  Credits
205
228
  -------
206
- AnyStyle is a volunteer effort and we encourage you
207
- to join us! Over the years our main contributors have been:
229
+ AnyStyle is a volunteer effort and you're encouraged to join!
230
+ Over the years the main contributors have been:
208
231
 
209
232
  * [Alex Fenton](https://github.com/a-fent)
210
233
  * [Sylvester Keil](https://github.com/inukshuk)
211
234
  * [Johannes Krtek](https://github.com/flachware)
212
235
  * [Ilja Srna](https://github.com/namyra)
213
236
 
237
+
214
238
  License
215
239
  -------
216
240
  Copyright 2011-2023 Sylvester Keil. All rights reserved.
217
241
 
218
242
  AnyStyle is distributed under a BSD-style license.
219
- See LICENSE for details.
243
+ See [LICENSE](./LICENSE) for details.
@@ -1,11 +1,13 @@
1
1
  module AnyStyle
2
- maybe_require 'language_detector'
2
+ maybe_require 'cld3'
3
3
  maybe_require 'unicode/scripts'
4
4
 
5
5
  class Normalizer
6
6
  class Locale < Normalizer
7
7
  def initialize
8
- @ld = LanguageDetector.new if defined?(LanguageDetector)
8
+ if defined?(::CLD3)
9
+ @ld = ::CLD3::NNetLanguageIdentifier.new(0, 1000)
10
+ end
9
11
  end
10
12
 
11
13
  def normalize(item, **opts)
@@ -24,14 +26,16 @@ module AnyStyle
24
26
  language = detect_language(sample)
25
27
  scripts = detect_scripts(sample)
26
28
 
27
- item[:language] ||= language unless language.nil?
29
+ item[:language] ||= language.to_s unless language.nil?
28
30
  item[:scripts] ||= scripts unless scripts.nil?
29
31
  item
30
32
  end
31
33
  end
32
34
 
33
35
  def detect_language(string)
34
- @ld.detect(string) unless @ld.nil?
36
+ if instance_variable_defined?('@ld') && string.length > 8
37
+ @ld.find_language(string).language
38
+ end
35
39
  end
36
40
 
37
41
  def detect_scripts(string)
@@ -105,7 +105,7 @@ module AnyStyle
105
105
  compact: true,
106
106
  threads: 4,
107
107
  separator: /(?:\r?\n)+/,
108
- delimiter: /\s+|([\uFF01-\uFF64]|。|、)/,
108
+ delimiter: /(\s|\p{Space_Separator})+|([\uFF01-\uFF64]|。|、)/,
109
109
  format: :hash,
110
110
  training_data: File.join(RES, 'parser', 'core.xml')
111
111
  }
@@ -1,3 +1,3 @@
1
1
  module AnyStyle
2
- VERSION = '1.4.2'.freeze
2
+ VERSION = '1.5.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.2
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvester Keil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-27 00:00:00.000000000 Z
11
+ date: 2024-01-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bibtex-ruby
@@ -77,6 +77,7 @@ extra_rdoc_files:
77
77
  - LICENSE
78
78
  files:
79
79
  - ".gitattributes"
80
+ - ".github/workflows/ci.yml"
80
81
  - HISTORY.md
81
82
  - LICENSE
82
83
  - README.md
@@ -182,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
183
  - !ruby/object:Gem::Version
183
184
  version: '0'
184
185
  requirements: []
185
- rubygems_version: 3.4.2
186
+ rubygems_version: 3.5.3
186
187
  signing_key:
187
188
  specification_version: 4
188
189
  summary: Smart and fast bibliography parser.