RubyGems - anystyle - Versions diffs - 1.4.2 → 1.5.0 - Mend

anystyle 1.4.2 → 1.5.0

Files changed (8) hide show

checksums.yaml +4 -4
data/.github/workflows/ci.yml +52 -0
data/HISTORY.md +11 -11
data/README.md +125 -101
data/lib/anystyle/normalizer/locale.rb +8 -4
data/lib/anystyle/parser.rb +1 -1
data/lib/anystyle/version.rb +1 -1
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5dd18e66e792426e7c6101a17785637f70b63b5384afe7db81406c1cebfe8708
-  data.tar.gz: 1aada4ba80830fee3350e58f2eca4d9efddef8956864181b96983e54f41dcb9c
+  metadata.gz: 68c4f52efb0ab5b4c9ba610afeb4c63d3c6db23fd5bbaa8be3e7d587f8051d6e
+  data.tar.gz: 713f8279d23f4f9338c34073a750767ec061e92e47511ee7569694fb462fa6b2
 SHA512:
-  metadata.gz: 3b0bf084efbd329498bbfecae8ee2c24be0ab3335f7540c7c2d59ab79fc3d163c099cc8e27683e5fd31f7a21421ee09e44eedd835dd0688c40f8f477ca457c3a
-  data.tar.gz: e17bb0d4745aa5947d100e81959b95bc515f697fa55f3f6500386a2231deda34858e161f63b8ce94fd53b682578997648a6db598c6396e8da5e0d3eff1d3e6c3
+  metadata.gz: a96358d0ca9731516e0ade60bd1089aa8d0547c97b88e89049b0d031b7847d66785e2a448b6966fda0bb1965190978098671f611cc79323c06a33f50562c9b7e
+  data.tar.gz: cf536f64d27601009c1f9270c7b13c5a09b683b20acd4315042d782418afa100699de5589a4c64f06bf499c631ff4396fc1854094087361f928c157c06d1e745

data/.github/workflows/ci.yml ADDED Viewed

@@ -0,0 +1,52 @@
+name: CI
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  ci:
+    name: ${{ matrix.ruby-version }} ${{ matrix.friendlyName }}-${{ matrix.arch }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        ruby-version:
+          - "2.7"
+          - "3.0"
+          - "3.1"
+          - "3.2"
+          - "3.3"
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+        arch:
+          - x64
+        include:
+          - os: ubuntu-latest
+            friendlyName: Linux
+          - os: macos-latest
+            friendlyName: macOS
+          - os: windows-latest
+            friendlyName: Windows
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Setup Ruby and install bundle
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby-version }}
+          bundler-cache: true
+      - name: Compile and run test
+        run: bundle exec rake
+      - name: Upload coverage results
+        if: matrix.ruby-version == '3.2'
+        continue-on-error: true
+        uses: coverallsapp/github-action@v2
+        with:
+          github-token: ${{ github.token }}

data/HISTORY.md CHANGED Viewed

@@ -1,6 +1,6 @@
 1.4.0 / 2023-01-06
 ==================
-* Removed deprectate string taint checking (@bbonamin).
+* Removed deprecated string taint checking (@bbonamin).
 * `AnyStyle::Parser#parse` will no longer automatically open local files.
   Please call `Wapiti::Dataset.open` explicitly if you relied on this.
@@ -17,7 +17,7 @@
 ==================
 * Updated and improved normalizers and CSL format.
 * Improved Chinese reference tokenization.
-* Added option to customizee pdftotext path.
+* Added option to customize pdftotext path.
 * Improved Finder reference line joining.
 * Improved Finder model; training sets.
 * Improved Parser model; training sets.
@@ -41,15 +41,15 @@
 1.0.1 / 2018-06-06
 ==================
-* Initial 1.0 release! This release is not backwards compatible to the
-  0.x branch. The new release uses the `AnyStyle` module and can be
-  installed using the `anystyle` Gem. The 0.x branch used the `Anystyle`
-  module and can still be installed using the `anystyle-parser` Gem but
-  will not be maintained any longer.
-* Includes vastly improved parser model and training sets.
-* Based on updated `wapiti-ruby` which builds on Linux, macOS, and
-  Windows platforms (thanks @a-fent and @WouterJeuris).
-* Flexible normalizer architecture (normalizers can be skipped individually).
+* Initial 1.0 release!
+    This release isn't backwards compatible to the 0.x branch.
+    The new release uses the `AnyStyle` module via the `anystyle` Gem.
+    The old 0.x branch used the `Anystyle` module via the `anystyle-parser` Gem
+    but isn't maintained any longer.
+* Includes improved parser model and training sets.
+* Based on updated `wapiti-ruby` which builds on Linux, macOS, and Windows platforms
+    (thanks @a-fent and @WouterJeuris).
+* Flexible normalizer architecture (you can skip individual normalizers).
 * Improved feature architecture.
 * Improved input/output via Wapiti::Dataset.
 * New default dictionary adapter (thanks @a-fent).

data/README.md CHANGED Viewed

@@ -1,24 +1,24 @@
 AnyStyle
 ========
-[![Build Status](https://travis-ci.org/inukshuk/anystyle.svg?branch=master)](https://travis-ci.org/inukshuk/anystyle)
-[![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/anystyle?branch=master)
+[![CI](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml/badge.svg)](https://github.com/inukshuk/anystyle/actions/workflows/ci.yml)
+[![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg)](https://coveralls.io/github/inukshuk/anystyle)
-AnyStyle is a very fast and smart parser for academic references. It
-was originally inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/)
-and [FreeCite](http://freecite.library.brown.edu/); AnyStyle uses machine
-learning algorithms and aims to make it easy to train the model with data
-that is relevant to your parsing needs.
+AnyStyle is a fast and smart parser of bibliographic references.
+Originally inspired by [parsCit][] and [FreeCite][],
+AnyStyle uses machine learning algorithms
+and aims to make it easy to train models
+with data that's relevant to you.
-Using AnyStyle CLI
-------------------
+Using AnyStyle on the command line
+----------------------------------
     $ [sudo] gem install anystyle-cli
     $ anystyle --help
     $ anystyle help find
     $ anystyle help parse
-See [anystyle-cli](https://github.com/inukshuk/anystyle-cli) for more details.
+See [anystyle-cli][] for more details.
 Using AnyStyle in Ruby
 ----------------------
@@ -26,8 +26,9 @@ Install the `anystyle` gem.
     $ [sudo] gem install anystyle
-Once installed, you can use the static Parser and Finder instances
-by calling the `AnyStyle.parse` or `AnyStyle.find` methods. For example:
+Now you can use the static Parser and Finder instances
+by calling the `AnyStyle.parse` or `AnyStyle.find` methods.
+For example:
 ```ruby
 require 'anystyle'
@@ -46,77 +47,86 @@ pp AnyStyle.parse 'Derrida, J. (1967). L’écriture et la différence (1 éd.).
 #}]
 ```
-Alternatively, you can create your own `AnyStyle::Parser` or
-`AnyStyle::Finder` with custom options.
+You can also create your own
+`AnyStyle::Parser` or `AnyStyle::Finder` with custom options.
-Using the AnyStyle Web App
---------------------------
-AnyStyle is available as web application at [anystyle.io](https://anystyle.io).
+Using AnyStyle on the web
+-------------------------
+AnyStyle is available at [anystyle.io][].
-The web application [is open source](https://github.com/inukshuk/anystyle.io)
-and you can also host yourself!
+The web application is [open source][]
+and you're welcome to host your own instance!
-Improving results for your data
-=================================
+[anystyle-cli]: https://github.com/inukshuk/anystyle-cli
+[anystyle.io]: https://anystyle.io
+[open source]: https://github.com/inukshuk/anystyle.io
+[parsCit]: http://aye.comp.nus.edu.sg/parsCit/
+[FreeCite]: http://freecite.library.brown.edu/
+Improving results for your data
+===============================
 Training
 --------
-You can train custom Finder and Parser models. To do this, you need
-to prepare your own data sets for training. You can create your own
-data from scratch or build on AnyStyle's default sets. The default
-parser model is based on the
-[core](https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml)
-data set; the default finder model source data is not publicly
-available in its entirety, but you can find a number of tagged
-documents
-[here](https://github.com/inukshuk/anystyle/blob/master/res/finder).
-When you have compiled a data set for training, you will be ready
-to create your own model:
+You can train custom Finder and Parser models.
+To do this, you need to prepare your own data sets for training.
+You can create your own data from scratch
+or build on AnyStyle's default sets.
+The default parser model uses the [core][] data set.
+And though the finder model sources aren't available in their entirety,
+due to copyright restrictions,
+you can find several [tagged documents][] here.
+When you have compiled a data set for training,
+you will be ready to create your own model:
     $ anystyle train training-data.xml custom.mod
-This will save your new model as `custom.mod`. To use your model
-instead of AnyStyle's default, use the `-P` or `--parser-model` flag
-and, respectively, `-F` or `--finder-model` to use a custom Finder
-model. For instance, the command below would parse all references
-in `bib.txt` using the custom model we just trained and print the
-result to STDOUT using the JSON output format:
+This will save your new model as `custom.mod`.
+To use your model instead of AnyStyle's default,
+use the `-P` or `--parser-model` flag and, respectively,
+`-F` or `--finder-model` to use a custom finder model.
+For instance, the command below
+will parse a file `bib.txt` with the custom model
+and print the result to STDOUT in JSON format:
     $ anystyle -P custom.mod -f json parse bib.txt -
-When training your own models, it is good practice to check the
-quality using a second data set. For example, using AnyStyle's own
-[gold](https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml)
-data set (a large, manually curated data set) we could check our
-custom model like this:
+When training your own models, it's good practice
+to check their quality using a second data set.
+For example, to check your custom model
+using AnyStyle's manually curated [gold][] data set:
     $ anystyle -P x.mod check ./res/parser/gold.xml
     Checking gold.xml.................   1 seq  0.06%   3 tok  0.01%  3s
-This command will print the sequence and token error rates; in
-the case of AnyStyle a the number of sequence errors is the number
-of references which were tagged differently by the parser than they
-were in the input; the number of token errors is the total number of
-words across all the references which were tagged differently. In the
-example above, we got one reference wrong (out of 1700 at the time);
-but even this one reference was mostly tagged correctly, because only
-a total of 3 words were tagged differently.
+This command prints sequence and token error rates.
+Here, sequence errors are the number of references
+tagged differently by the parser
+as compared to the curated input;
+the number of token errors
+is the total number of words in these references that were tagged differently.
+In the example above, one reference was wrong
+(out of 1,700 at the time),
+because a total of three words had a different tag.
+When working with training data,
+it's a good idea to use the `Wapiti::Dataset` API in Ruby:
+it supports standard set operators
+and makes it easy to combine or compare data sets.
+[core]: https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml
+[gold]: https://github.com/inukshuk/anystyle/blob/master/res/parser/gold.xml
+[tagged documents]: https://github.com/inukshuk/anystyle/blob/master/res/finder
-When working with training data, it is a good idea to use the
-`Wapiti::Dataset` API in Ruby: it supports all the standard set
-operators and makes it very easy to combine or compare data sets.
 Natural Languages used in AnyStyle
 ----------------------------------
-As mentioned above, the
-[core](https://github.com/inukshuk/anystyle/blob/master/res/parser/core.xml)
-dataset contains the manually marked-up references that are used as the
-basis for the default AnyStyle parsing model. If the references you are
-trying to parse include many non-English documents, the distribution of
-natural languages in this corpus is relevant (detected using [cld](https://github.com/jtoy/cld)).
+The [core][] data set contains the manually marked-up references
+which comprise AnyStyle's default parser model.
+If your references include non-English documents,
+the distribution of natural languages in this corpus is relevant.
 | Language                | n   |
 |-------------------------|-----|
@@ -129,42 +139,51 @@ natural languages in this corpus is relevant (detected using [cld](https://githu
 | Not reliably determined | 449 |
 | (but mainly English)    |     |
-(These data are based on AnyStyle version 1.3.13)
+(Measured using [cld][] and AnyStyle version 1.3.13)
 There is a strong prevalence of English-language documents with the
-conventions used in English-language bibliographies, with some
-representation of other European languages. The languages used reflect
-those used in scientific publishing as well as the maintainers'
-competencies. If you are working with many documents in languages other
-than English, you might consider training the model with some examples
+conventions used in English-language bibliographies,
+with some representation of other European languages.
+The languages used reflect those used in scientific publishing
+as well as the maintainers' competencies.
+If you are working with documents in languages other than English,
+you might consider training the model with some examples
 in the relevant languages.
-AnyStyle should work with references written in any Latin script
-(including most European languages, languages such as Indonesian and
-Malaysian, as well as romanised Arabic, Chinese and Japanese). It should
-also support languages written with non-Latin alphabets (such as
-Russian), although no examples of these appear in the default training
-sets. Languages written in syllabaries or complex symbols which do not
-use white space to separate tokens are not compatible with AnyStyle's
-approach: this includes Chinese, Japanese, Arabic as well as many Indian
-languages.
+AnyStyle works with references written in any Latin script,
+including most European languages,
+languages such as Indonesian and Malaysian,
+as well as romanized Arabic, Chinese and Japanese.
+It also supports non-Latin alphabets such as Cyrillic,
+although no examples of these appear in the default training sets.
+Languages written in syllabaries or complex symbols
+which don't use white space to separate tokens
+aren't compatible with AnyStyle's approach:
+this includes Chinese, Japanese, Arabic, and Indian languages.
+[cld]: https://github.com/jtoy/cld
 Dictionary Adapters
 -------------------
-During the statistical analysis of reference strings, AnyStyle relies
-on a large feature dictionary; by default, AnyStyle creates a persistent
-Ruby Hash in the folder of the `anystyle-data` Gem. This uses up about
-2MB of disk space and keeps the entire dictionary in memory. If you prefer
-a smaller memory footprint, you can alternatively use AnyStyle's GDBM
-dictionary. GDBM bindings are part of the Ruby standard library and are
-supported on all platforms, but you may have to install GDBM on your
-platform before installing Ruby.
-If you do not want to use the the persistent Ruyb Hash nor the GBDM
-bindings, you can store your dictionary in memory (not recommended) or
-use a Redis. The best way to change the default dictionary adapter is by
-adjusting AnyStyle's default configuration (when using the default parser
-instances you must set the default before using the parser):
+During the statistical analysis of reference strings,
+AnyStyle relies on a large feature dictionary;
+by default, AnyStyle creates a persistent Ruby hash
+in the folder of the `anystyle-data` Gem.
+This uses up about 2MB of disk space
+and keeps the entire dictionary in memory.
+If you prefer a smaller memory footprint,
+you can use AnyStyle's GDBM dictionary.
+GDBM bindings are part of the Ruby standard library
+and supported on all platforms,
+though you may need to install GDBM before installing Ruby.
+If you don't want to use the persistent Ruby hash nor GDBM,
+you can store your dictionary in memory or use a Redis.
+The best way to change the default dictionary adapter
+is by adjusting AnyStyle's default configuration
+(when using the static parser instances
+you must set the default before using the parser):
     AnyStyle::Dictionary.defaults[:adapter] = :ruby
     #-> Use a persistent Ruby hash;
@@ -186,34 +205,39 @@ and configure AnyStyle to use the Redis adapter:
     AnyStyle::Dictionary::Redis.defaults[:host] = 'localhost'
     AnyStyle::Dictionary::Redis.defaults[:port] = 6379
 About AnyStyle
 ==============
 Contributing
 ------------
-The AnyStyle source code is
-[hosted on GitHub](https://github.com/inukshuk/anystyle/).
+The AnyStyle source code is hosted on [GitHub][].
 You can check out a copy of the latest code using Git:
     $ git clone https://github.com/inukshuk/anystyle.git
-If you've found a bug or have a question, please open an issue on the
-[AnyStyle issue tracker](https://github.com/inukshuk/anystyle/issues).
-Or, for extra credit, clone the AnyStyle repository, write a failing
-example, fix the bug and submit a pull request.
+If you've found a bug or have a question,
+please [report the issue][] or,
+for extra credit, clone the AnyStyle repository,
+write a failing example, fix the bug and submit a pull request.
+[GitHub]: https://github.com/inukshuk/anystyle/
+[report the issue]: https://github.com/inukshuk/anystyle/issues
 Credits
 -------
-AnyStyle is a volunteer effort and we encourage you
-to join us! Over the years our main contributors have been:
+AnyStyle is a volunteer effort and you're encouraged to join!
+Over the years the main contributors have been:
 * [Alex Fenton](https://github.com/a-fent)
 * [Sylvester Keil](https://github.com/inukshuk)
 * [Johannes Krtek](https://github.com/flachware)
 * [Ilja Srna](https://github.com/namyra)
 License
 -------
 Copyright 2011-2023 Sylvester Keil. All rights reserved.
 AnyStyle is distributed under a BSD-style license.
-See LICENSE for details.
+See [LICENSE](./LICENSE) for details.

data/lib/anystyle/normalizer/locale.rb CHANGED Viewed

@@ -1,11 +1,13 @@
 module AnyStyle
-  maybe_require 'language_detector'
+  maybe_require 'cld3'
   maybe_require 'unicode/scripts'
   class Normalizer
     class Locale < Normalizer
       def initialize
-        @ld = LanguageDetector.new if defined?(LanguageDetector)
+        if defined?(::CLD3)
+          @ld = ::CLD3::NNetLanguageIdentifier.new(0, 1000)
+        end
       end
       def normalize(item, **opts)
@@ -24,14 +26,16 @@ module AnyStyle
         language = detect_language(sample)
         scripts = detect_scripts(sample)
-        item[:language] ||= language unless language.nil?
+        item[:language] ||= language.to_s unless language.nil?
         item[:scripts] ||= scripts unless scripts.nil?
         item
       end
     end
     def detect_language(string)
-      @ld.detect(string) unless @ld.nil?
+      if instance_variable_defined?('@ld') && string.length > 8
+        @ld.find_language(string).language
+      end
     end
     def detect_scripts(string)

data/lib/anystyle/parser.rb CHANGED Viewed

@@ -105,7 +105,7 @@ module AnyStyle
       compact: true,
       threads: 4,
       separator: /(?:\r?\n)+/,
-      delimiter: /\s+|([\uFF01-\uFF64]|。|、)/,
+      delimiter: /(\s|\p{Space_Separator})+|([\uFF01-\uFF64]|。|、)/,
       format: :hash,
       training_data: File.join(RES, 'parser', 'core.xml')
     }

data/lib/anystyle/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module AnyStyle
-  VERSION = '1.4.2'.freeze
+  VERSION = '1.5.0'.freeze
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: anystyle
 version: !ruby/object:Gem::Version
-  version: 1.4.2
+  version: 1.5.0
 platform: ruby
 authors:
 - Sylvester Keil
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-03-27 00:00:00.000000000 Z
+date: 2024-01-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bibtex-ruby
@@ -77,6 +77,7 @@ extra_rdoc_files:
 - LICENSE
 files:
 - ".gitattributes"
+- ".github/workflows/ci.yml"
 - HISTORY.md
 - LICENSE
 - README.md
@@ -182,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.2
+rubygems_version: 3.5.3
 signing_key:
 specification_version: 4
 summary: Smart and fast bibliography parser.