anystyle 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 694288a5b219077818965fffd8eed72aa01c3a096a99647c2ee6014edeb13e38
4
+ data.tar.gz: db7aefb69e10d359aaf55a1da8d0c6041bfefb91088f5e1cc330e6bb3a13e005
5
+ SHA512:
6
+ metadata.gz: 595b8627b93a7e61912fc65f1d1c14a4a0dbd6263d7612c21184aa1e9e42856e16c265a29c6e8b2b3588da1b6a70ca456145ae72c4f4980fea502a7fde314de7
7
+ data.tar.gz: d14ae39fd0e156e76619b8b23e171a8b364fa1d14e00c4f9517edd8bc40d5ff082c83073343fcb345f42a3a5b6828c4358a29593067f35696584d8e868113cfa
@@ -0,0 +1,78 @@
1
+ 0.6.0 / 2014-03-24
2
+ ==================
3
+ * Renamed tech label to genre; institution to authority
4
+ * Updated classifier
5
+ * Updated model
6
+
7
+ 0.5.3 / 2014-03-14
8
+ ==================
9
+ * Added source, director, producer and section labels
10
+ * Updated model
11
+
12
+ 0.5.2 / 2014-03-13
13
+ ==================
14
+ * Add XML output
15
+ * Improve URL normalizer
16
+
17
+ 0.4.4 & 0.4.5 / 2014-03-10
18
+ ==========================
19
+ * Mitigate potential vulnerability: open files only if string not tainted
20
+
21
+ 0.4.3 / 2014-03-09
22
+ ==================
23
+ * Add model reload method
24
+
25
+ 0.4.2 / 2014-03-08
26
+ ==================
27
+ * Add redis-namespace support
28
+
29
+ 0.4.1 / 2014-03-03
30
+ ==================
31
+ * Normalizer tweaks
32
+
33
+ 0.4.0 / 2014-02-27
34
+ ==================
35
+ * Update wapiti
36
+ * Improve dash patterns
37
+ * Updated default model
38
+
39
+ 0.3.0 / 2014-02-14
40
+ ==================
41
+ * Update dependencies
42
+ * Added raw output format
43
+ * Enforce Ruby 1.9.3 or later requirement
44
+
45
+ 0.2.0 / 2012-10-29
46
+ ==================
47
+ * Use Namae for name normalization
48
+
49
+ 0.1.1 / 2012-03-29
50
+ ==================
51
+ * Bugfix
52
+
53
+ 0.1.0 / 2012-03-03
54
+ ==================
55
+ * Added redis as data store option
56
+
57
+ 0.0.10 / 2012-03-01
58
+ ===================
59
+ * Added new output format: tags (to generate training data)
60
+
61
+ 0.0.9 / 2011-09-08
62
+ ==================
63
+ * Added year-range / page-range feature distinction
64
+ * Improved model and training data
65
+ * Added URL, DOI and ISBN recognition features
66
+ * Improved handling of UTF-8 characters in Regular Expressions
67
+ * Bugfixes
68
+
69
+ 0.0.6 / 2011-09-06
70
+ ==================
71
+ * Added location normalizer
72
+ * Improved punctuation feature elicitation
73
+ * Improved name tokenizing
74
+ * Bugfixes
75
+
76
+ 0.0.1 / 2011-09-05
77
+ ==================
78
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,27 @@
1
+ AnyStyle
2
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ 1. Redistributions of source code must retain the above copyright notice,
8
+ this list of conditions and the following disclaimer.
9
+
10
+ 2. Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
15
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
16
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
17
+ EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
18
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
22
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
+
25
+ The views and conclusions contained in the software and documentation are
26
+ those of the authors and should not be interpreted as representing official
27
+ policies, either expressed or implied, of the copyright holder.
@@ -0,0 +1,103 @@
1
+ AnyStyle
2
+ ========
3
+ [![Build Status](https://travis-ci.org/inukshuk/anystyle.svg?branch=master)](https://travis-ci.org/inukshuk/anystyle)
4
+ [![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/anystyle?branch=master)
5
+
6
+ AnyStyle is a very fast and smart parser for academic references. It
7
+ was originally inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/)
8
+ and [FreeCite](http://freecite.library.brown.edu/); AnyStyle uses machine
9
+ learning algorithms and aims to make it easy to train the model with data
10
+ that is relevant to your parsing needs.
11
+
12
+
13
+ Using AnyStyle CLI
14
+ ------------------
15
+
16
+ $ [sudo] gem install anystyle-cli
17
+ $ anystyle --help
18
+ $ anystyle help find
19
+ $ anystyle help parse
20
+
21
+ See [anystyle-cli](https://github.com/inukshuk/anystyle-cli) for more details.
22
+
23
+
24
+ Web Application and Web Service
25
+ -------------------------------
26
+ AnyStyle is available as a web-application and service at
27
+ [anystyle.io](https://anystyle.io).
28
+
29
+ Please note that the web service is currently based on the legacy
30
+ [0.x branch](https://github.com/inukshuk/anystyle/tree/0.x).
31
+
32
+
33
+ Using AnyStyle in Ruby
34
+ ----------------------
35
+
36
+ $ [sudo] gem install anystyle
37
+
38
+
39
+ Reference Parsing
40
+ -----------------
41
+
42
+ Document Parsing
43
+ ----------------
44
+
45
+ Training
46
+ --------
47
+
48
+ Dictionary Adapters
49
+ -------------------
50
+ During the statistical analysis of reference strings, AnyStyle relies
51
+ on a large feature dictionary; by default, AnyStyle creates a persistent
52
+ Ruby Hash in the folder of the `anystyle-data` Gem. This uses up about
53
+ 2MB of disk space and keeps the entire dictionary in memory. If you prefer
54
+ a smaller memory footprint, you can alternatively use AnyStyle's GDBM
55
+ dictionary. GDBM bindings are part of the Ruby standard library and are
56
+ supported on all platforms, but you may have to install GDBM on your
57
+ platform before installing Ruby.
58
+
59
+ If you do not want to use the persistent Ruby Hash or the GDBM
60
+ bindings, you can store your dictionary in memory (not recommended) or
61
+ use Redis. The best way to change the default dictionary adapter is by
62
+ adjusting AnyStyle's default configuration (when using the default parser
63
+ instances you must set the default before using the parser):
64
+
65
+ AnyStyle::Dictionary.defaults[:adapter] = :ruby
66
+ #-> Use a persistent Ruby hash;
67
+ #-> slower start-up than GDBM but no extra dependency
68
+
69
+ AnyStyle::Dictionary.defaults[:adapter] = :hash
70
+ #-> Use in-memory dictionary; slow start-up but uses no space on disk
71
+
72
+ require 'anystyle/dictionary/gdbm'
73
+ AnyStyle::Dictionary.defaults[:adapter] = :gdbm
74
+
75
+ To use Redis, install the `redis` and `redis-namespace` (optional) Gems
76
+ and configure AnyStyle to use the Redis adapter:
77
+
78
+ AnyStyle::Dictionary.defaults[:adapter] = :redis
79
+
80
+ # Adjust the Redis-specific configuration
81
+ require 'anystyle/dictionary/redis'
82
+ AnyStyle::Dictionary::Redis.defaults[:host] = 'localhost'
83
+ AnyStyle::Dictionary::Redis.defaults[:port] = 6379
84
+
85
+ Contributing
86
+ ------------
87
+ The AnyStyle source code is
88
+ [hosted on GitHub](https://github.com/inukshuk/anystyle/).
89
+ You can check out a copy of the latest code using Git:
90
+
91
+ $ git clone https://github.com/inukshuk/anystyle.git
92
+
93
+ If you've found a bug or have a question, please open an issue on the
94
+ [AnyStyle issue tracker](http://github.com/inukshuk/anystyle/issues).
95
+ Or, for extra credit, clone the AnyStyle repository, write a failing
96
+ example, fix the bug and submit a pull request.
97
+
98
+ License
99
+ -------
100
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.
101
+
102
+ AnyStyle is distributed under a BSD-style license.
103
+ See LICENSE for details.
@@ -0,0 +1,71 @@
1
+ require 'forwardable'
2
+ require 'wapiti'
3
+
4
+ require 'anystyle/version'
5
+ require 'anystyle/support'
6
+ require 'anystyle/errors'
7
+ require 'anystyle/utils'
8
+ require 'anystyle/dictionary'
9
+ require 'anystyle/dictionary/marshal'
10
+ require 'anystyle/data'
11
+
12
+ require 'anystyle/feature'
13
+ require 'anystyle/feature/affix'
14
+ require 'anystyle/feature/brackets'
15
+ require 'anystyle/feature/canonical'
16
+ require 'anystyle/feature/caps'
17
+ require 'anystyle/feature/category'
18
+ require 'anystyle/feature/dictionary'
19
+ require 'anystyle/feature/indent'
20
+ require 'anystyle/feature/keyword'
21
+ require 'anystyle/feature/line'
22
+ require 'anystyle/feature/locator'
23
+ require 'anystyle/feature/number'
24
+ require 'anystyle/feature/position'
25
+ require 'anystyle/feature/punctuation'
26
+ require 'anystyle/feature/ref'
27
+ require 'anystyle/feature/terminal'
28
+ require 'anystyle/feature/words'
29
+
30
+ require 'anystyle/normalizer'
31
+ require 'anystyle/normalizer/brackets'
32
+ require 'anystyle/normalizer/container'
33
+ require 'anystyle/normalizer/date'
34
+ require 'anystyle/normalizer/edition'
35
+ require 'anystyle/normalizer/journal'
36
+ require 'anystyle/normalizer/locale'
37
+ require 'anystyle/normalizer/location'
38
+ require 'anystyle/normalizer/locator'
39
+ require 'anystyle/normalizer/names'
40
+ require 'anystyle/normalizer/page'
41
+ require 'anystyle/normalizer/publisher'
42
+ require 'anystyle/normalizer/pubmed'
43
+ require 'anystyle/normalizer/punctuation'
44
+ require 'anystyle/normalizer/quotes'
45
+ require 'anystyle/normalizer/type'
46
+ require 'anystyle/normalizer/volume'
47
+
48
+ require 'anystyle/format/bibtex'
49
+ require 'anystyle/format/csl'
50
+
51
+ require 'anystyle/document'
52
+ require 'anystyle/parser'
53
+ require 'anystyle/finder'
54
+
55
+ module AnyStyle
56
+ def self.parser
57
+ Parser.instance
58
+ end
59
+
60
+ def self.parse(*arguments)
61
+ parser.parse(*arguments)
62
+ end
63
+
64
+ def self.finder
65
+ Finder.instance
66
+ end
67
+
68
+ def self.find(*arguments)
69
+ finder.find(*arguments)
70
+ end
71
+ end
@@ -0,0 +1,132 @@
1
module AnyStyle
  # In-memory feature dictionary. Each key (stored as a String) maps to an
  # Integer bit mask in which every tag listed in Dictionary.tags owns one
  # bit (see Dictionary.code). Subclasses provide other storage back ends.
  class Dictionary
    @tags = [:name, :place, :publisher, :journal]

    # tag => bit value (1, 2, 4, ...), in the order of @tags.
    @code = @tags.each_with_index.map { |tag, i| [tag, 2**i] }.to_h

    @tags.freeze
    @code.freeze

    @defaults = {
      adapter: :ruby,
      source: nil
    }

    class << self
      attr_reader :tags, :code, :defaults, :adapters

      # Builds a dictionary for the adapter named in +options+.
      #
      # options - a Hash merged over +defaults+, or an existing Dictionary,
      #           which is returned unchanged. The :adapter entry selects the
      #           implementation; the remaining entries are handed to it.
      #
      # Returns the new (not yet opened) dictionary.
      # Raises ArgumentError if the adapter is missing or unknown.
      def create(options = {})
        return options if options.is_a?(Dictionary)

        options = defaults.merge(options || {})
        adapter = options.delete(:adapter)
        # A nil adapter must end up in the ArgumentError branch below rather
        # than fail with NoMethodError on nil.to_sym.
        adapter = adapter.to_sym if adapter.respond_to?(:to_sym)

        case adapter
        when :memory, :hash
          new options
        when :gdbm
          require 'anystyle/dictionary/gdbm'
          Dictionary::GDBM.new options
        when :lmdb
          require 'anystyle/dictionary/lmdb'
          Dictionary::LMDB.new options
        when :redis
          require 'anystyle/dictionary/redis'
          Dictionary::Redis.new options
        when :marshal, :ruby
          require 'anystyle/dictionary/marshal'
          Dictionary::Marshal.new options
        else
          raise ArgumentError, "unknown adapter: #{adapter}"
        end
      end

      # Returns the dictionary cached for the current thread, creating and
      # opening one with the default options on first use.
      def instance
        Thread.current['anystyle_dictionary'] ||= create.open
      end
    end

    attr_reader :db, :options

    def initialize(options)
      @options = options
    end

    # Ensures a backing Hash exists and fills it from the source file when
    # it is empty. Returns self.
    def open
      @db = {} unless open?
      populate! if empty?
      self
    end

    # Drops the backing store.
    def close
      @db = nil
    end

    # For the in-memory store, truncating is the same as closing.
    def truncate
      close
    end

    def open?
      !db.nil?
    end

    def empty?
      db.empty?
    end

    # Returns the bit mask stored for +key+; 0 when the key is unknown.
    def get(key)
      db[key.to_s].to_i
    end

    # Stores +value+ (coerced to Integer) under +key+ (coerced to String).
    def put(key, value)
      db[key.to_s] = value.to_i
    end

    alias_method :[], :get
    alias_method :[]=, :put

    # Returns one 'T'/'F' flag per entry of Dictionary.tags, telling whether
    # that tag's bit is set for +key+.
    def tags(key)
      value = get key

      Dictionary.tags.map { |tag|
        (value & Dictionary.code[tag] > 0) ? 'T' : 'F'
      }
    end

    # Returns an Array, parallel to Dictionary.tags, counting how many of
    # +keys+ carry each tag.
    def tag_counts(keys)
      counts = Dictionary.tags.map { 0 }
      keys.each do |key|
        value = get(key)
        next unless value > 0

        Dictionary.tags.each.with_index do |tag, idx|
          counts[idx] += 1 if (value & Dictionary.code[tag] > 0)
        end
      end
      counts
    end

    # Loads the gzip-compressed, line-oriented file at options[:source].
    #
    # A "#! <tag>" line selects the tag applied to the entries that follow,
    # other lines starting with "#" are comments, and blank lines are
    # ignored. On every remaining line a trailing decimal number is cut off
    # and the rest becomes the key, whose mask is OR-ed with the current tag.
    #
    # NOTE(review): a "#! <tag>" naming a tag absent from Dictionary.code
    # leaves the mode nil, so the next entry raises TypeError - confirm
    # whether source files can contain other section names.
    def populate!
      require 'zlib'

      File.open(options[:source], 'rb') do |file|
        mode = 0
        reader = Zlib::GzipReader.new(file, encoding: 'UTF-8')

        begin
          reader.each do |line|
            line.strip!

            case line
            when ''
              # skip blank lines; they would otherwise be stored under ""
            when /^#! (\w+)/i
              mode = Dictionary.code[$1.to_sym]
            when /^#/
              # skip comments
            else
              key = line.split(/\s+(\d+\.\d+)\s*$/)[0]
              put key, get(key) | mode
            end
          end
        ensure
          # Release the inflate stream; File.open closes the file itself.
          reader.finish
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
module AnyStyle
  require 'gdbm'

  class Dictionary
    # Dictionary adapter that keeps its entries in a GDBM database file.
    class GDBM < Dictionary
      @defaults = {
        path: File.expand_path('../../data/dict.db', __FILE__),
        mode: 0666,
        flags: ::GDBM::WRCREAT | ::GDBM::NOLOCK
      }

      attr_reader :env

      # options - Hash merged over the class defaults (:path, :mode, :flags).
      def initialize(options = {})
        super(self.class.defaults.merge(options))
      end

      # Closes any open handle, opens the database at options[:path] with the
      # configured mode and flags, and populates it when it is empty.
      # Returns self.
      def open
        close
        @db = ::GDBM.new(*options.values_at(:path, :mode, :flags))
        self
      ensure
        populate! if empty?
      end

      # Closes the database handle if one is open.
      def close
        db.close if open?
      end

      def open?
        !(db.nil? || db.closed?)
      end

      # False when the database is not open.
      def empty?
        open? && db.empty?
      end

      # Closes the database and removes its file. Does nothing beyond closing
      # when the file is already gone, instead of raising Errno::ENOENT.
      def truncate
        close
        File.unlink(options[:path]) if File.exist?(options[:path])
      end

      # Returns the stored value as an Integer; 0 when the key is missing.
      def get(key)
        db[key.to_s].to_i
      end

      # The value is written as the decimal String form of its Integer value.
      def put(key, value)
        db[key.to_s] = value.to_i.to_s
      end
    end
  end
end