anystyle 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 694288a5b219077818965fffd8eed72aa01c3a096a99647c2ee6014edeb13e38
4
+ data.tar.gz: db7aefb69e10d359aaf55a1da8d0c6041bfefb91088f5e1cc330e6bb3a13e005
5
+ SHA512:
6
+ metadata.gz: 595b8627b93a7e61912fc65f1d1c14a4a0dbd6263d7612c21184aa1e9e42856e16c265a29c6e8b2b3588da1b6a70ca456145ae72c4f4980fea502a7fde314de7
7
+ data.tar.gz: d14ae39fd0e156e76619b8b23e171a8b364fa1d14e00c4f9517edd8bc40d5ff082c83073343fcb345f42a3a5b6828c4358a29593067f35696584d8e868113cfa
@@ -0,0 +1,78 @@
1
+ 0.6.0 / 2014-03-24
2
+ ==================
3
+ * Renamed tech label to genre; institution to authority
4
+ * Updated classifier
5
+ * Updated model
6
+
7
+ 0.5.3 / 2014-03-14
8
+ ==================
9
+ * Added source, director, producer and section labels
10
+ * Updated model
11
+
12
+ 0.5.2 / 2014-03-13
13
+ ==================
14
+ * Add XML output
15
+ * Improve URL normalizer
16
+
17
+ 0.4.4 & 0.4.5 / 2014-03-10
18
+ ==========================
19
+ * Mitigate potential vulnerability: open files only if string not tainted
20
+
21
+ 0.4.3 / 2014-03-09
22
+ ==================
23
+ * Add model reload method
24
+
25
+ 0.4.2 / 2014-03-08
26
+ ==================
27
+ * Add redis-namespace support
28
+
29
+ 0.4.1 / 2014-03-03
30
+ ==================
31
+ * Normalizer tweaks
32
+
33
+ 0.4.0 / 2014-02-27
34
+ ==================
35
+ * Update wapiti
36
+ * Improve dash patterns
37
+ * Updated default model
38
+
39
+ 0.3.0 / 2014-02-14
40
+ ==================
41
+ * Update dependencies
42
+ * Added raw output format
43
+ * Enforce Ruby 1.9.3 or later requirement
44
+
45
+ 0.2.0 / 2012-10-29
46
+ ==================
47
+ * Use Namae for name normalization
48
+
49
+ 0.1.1 / 2012-03-29
50
+ ==================
51
+ * Bugfix
52
+
53
+ 0.1.0 / 2012-03-03
54
+ ==================
55
+ * Added redis as data store option
56
+
57
+ 0.0.10 / 2012-03-01
58
+ ===================
59
+ * Added new output format: tags (to generate training data)
60
+
61
+ 0.0.9 / 2011-09-08
62
+ ==================
63
+ * Added year-range / page-range feature distinction
64
+ * Improved model and training data
65
+ * Added URL, DOI and ISBN recognition features
66
+ * Improved handling of UTF-8 characters in Regular Expressions
67
+ * Bugfixes
68
+
69
+ 0.0.6 / 2011-09-06
70
+ ==================
71
+ * Added location normalizer
72
+ * Improved punctuation feature elicitation
73
+ * Improved name tokenizing
74
+ * Bugfixes
75
+
76
+ 0.0.1 / 2011-09-05
77
+ ==================
78
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,27 @@
1
+ AnyStyle
2
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ 1. Redistributions of source code must retain the above copyright notice,
8
+ this list of conditions and the following disclaimer.
9
+
10
+ 2. Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
15
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
16
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
17
+ EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
18
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
22
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
+
25
+ The views and conclusions contained in the software and documentation are
26
+ those of the authors and should not be interpreted as representing official
27
+ policies, either expressed or implied, of the copyright holder.
@@ -0,0 +1,103 @@
1
+ AnyStyle
2
+ ========
3
+ [![Build Status](https://travis-ci.org/inukshuk/anystyle.svg?branch=master)](https://travis-ci.org/inukshuk/anystyle)
4
+ [![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/anystyle?branch=master)
5
+
6
+ AnyStyle is a very fast and smart parser for academic references. It
7
+ was originally inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/)
8
+ and [FreeCite](http://freecite.library.brown.edu/); AnyStyle uses machine
9
+ learning algorithms and aims to make it easy to train the model with data
10
+ that is relevant to your parsing needs.
11
+
12
+
13
+ Using AnyStyle CLI
14
+ ------------------
15
+
16
+ $ [sudo] gem install anystyle-cli
17
+ $ anystyle --help
18
+ $ anystyle help find
19
+ $ anystyle help parse
20
+
21
+ See [anystyle-cli](https://github.com/inukshuk/anystyle-cli) for more details.
22
+
23
+
24
+ Web Application and Web Service
25
+ -------------------------------
26
+ AnyStyle is available as a web-application and service at
27
+ [anystyle.io](https://anystyle.io).
28
+
29
+ Please note that the web service is currently based on the legacy
30
+ [0.x branch](https://github.com/inukshuk/anystyle/tree/0.x).
31
+
32
+
33
+ Using AnyStyle in Ruby
34
+ ----------------------
35
+
36
+ $ [sudo] gem install anystyle
37
+
38
+
39
+ Reference Parsing
40
+ -----------------
41
+
42
+ Document Parsing
43
+ ----------------
44
+
45
+ Training
46
+ --------
47
+
48
+ Dictionary Adapters
49
+ -------------------
50
+ During the statistical analysis of reference strings, AnyStyle relies
51
+ on a large feature dictionary; by default, AnyStyle creates a persistent
52
+ Ruby Hash in the folder of the `anystyle-data` Gem. This uses up about
53
+ 2MB of disk space and keeps the entire dictionary in memory. If you prefer
54
+ a smaller memory footprint, you can alternatively use AnyStyle's GDBM
55
+ dictionary. GDBM bindings are part of the Ruby standard library and are
56
+ supported on all platforms, but you may have to install GDBM on your
57
+ platform before installing Ruby.
58
+
59
+ If you do not want to use the persistent Ruby Hash or the GDBM
60
+ bindings, you can store your dictionary in memory (not recommended) or
61
+ use Redis. The best way to change the default dictionary adapter is by
62
+ adjusting AnyStyle's default configuration (when using the default parser
63
+ instances you must set the default before using the parser):
64
+
65
+ AnyStyle::Dictionary.defaults[:adapter] = :ruby
66
+ #-> Use a persistent Ruby hash;
67
+ #-> slower start-up than GDBM but no extra dependency
68
+
69
+ AnyStyle::Dictionary.defaults[:adapter] = :hash
70
+ #-> Use in-memory dictionary; slow start-up but uses no space on disk
71
+
72
+ require 'anystyle/dictionary/gdbm'
73
+ AnyStyle::Dictionary.defaults[:adapter] = :gdbm
74
+
75
+ To use Redis, install the `redis` and `redis-namespace` (optional) Gems
76
+ and configure AnyStyle to use the Redis adapter:
77
+
78
+ AnyStyle::Dictionary.defaults[:adapter] = :redis
79
+
80
+ # Adjust the Redis-specific configuration
81
+ require 'anystyle/dictionary/redis'
82
+ AnyStyle::Dictionary::Redis.defaults[:host] = 'localhost'
83
+ AnyStyle::Dictionary::Redis.defaults[:port] = 6379
84
+
85
+ Contributing
86
+ ------------
87
+ The AnyStyle source code is
88
+ [hosted on GitHub](https://github.com/inukshuk/anystyle/).
89
+ You can check out a copy of the latest code using Git:
90
+
91
+ $ git clone https://github.com/inukshuk/anystyle.git
92
+
93
+ If you've found a bug or have a question, please open an issue on the
94
+ [AnyStyle issue tracker](http://github.com/inukshuk/anystyle/issues).
95
+ Or, for extra credit, clone the AnyStyle repository, write a failing
96
+ example, fix the bug and submit a pull request.
97
+
98
+ License
99
+ -------
100
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.
101
+
102
+ AnyStyle is distributed under a BSD-style license.
103
+ See LICENSE for details.
@@ -0,0 +1,71 @@
1
+ require 'forwardable'
2
+ require 'wapiti'
3
+
4
+ require 'anystyle/version'
5
+ require 'anystyle/support'
6
+ require 'anystyle/errors'
7
+ require 'anystyle/utils'
8
+ require 'anystyle/dictionary'
9
+ require 'anystyle/dictionary/marshal'
10
+ require 'anystyle/data'
11
+
12
+ require 'anystyle/feature'
13
+ require 'anystyle/feature/affix'
14
+ require 'anystyle/feature/brackets'
15
+ require 'anystyle/feature/canonical'
16
+ require 'anystyle/feature/caps'
17
+ require 'anystyle/feature/category'
18
+ require 'anystyle/feature/dictionary'
19
+ require 'anystyle/feature/indent'
20
+ require 'anystyle/feature/keyword'
21
+ require 'anystyle/feature/line'
22
+ require 'anystyle/feature/locator'
23
+ require 'anystyle/feature/number'
24
+ require 'anystyle/feature/position'
25
+ require 'anystyle/feature/punctuation'
26
+ require 'anystyle/feature/ref'
27
+ require 'anystyle/feature/terminal'
28
+ require 'anystyle/feature/words'
29
+
30
+ require 'anystyle/normalizer'
31
+ require 'anystyle/normalizer/brackets'
32
+ require 'anystyle/normalizer/container'
33
+ require 'anystyle/normalizer/date'
34
+ require 'anystyle/normalizer/edition'
35
+ require 'anystyle/normalizer/journal'
36
+ require 'anystyle/normalizer/locale'
37
+ require 'anystyle/normalizer/location'
38
+ require 'anystyle/normalizer/locator'
39
+ require 'anystyle/normalizer/names'
40
+ require 'anystyle/normalizer/page'
41
+ require 'anystyle/normalizer/publisher'
42
+ require 'anystyle/normalizer/pubmed'
43
+ require 'anystyle/normalizer/punctuation'
44
+ require 'anystyle/normalizer/quotes'
45
+ require 'anystyle/normalizer/type'
46
+ require 'anystyle/normalizer/volume'
47
+
48
+ require 'anystyle/format/bibtex'
49
+ require 'anystyle/format/csl'
50
+
51
+ require 'anystyle/document'
52
+ require 'anystyle/parser'
53
+ require 'anystyle/finder'
54
+
# Module-level convenience API: shortcuts that delegate to the shared
# Parser and Finder instances.
module AnyStyle
  # Returns the shared parser, i.e. whatever Parser.instance returns.
  def self.parser
    Parser.instance
  end

  # Forwards every argument to the shared parser's #parse and returns its
  # result.
  #
  # Keyword options are captured and re-splatted explicitly: with a bare
  # *arguments splat, Ruby 3 hands them to the target as one positional
  # Hash instead of as keywords.
  def self.parse(*arguments, **options)
    parser.parse(*arguments, **options)
  end

  # Returns the shared finder, i.e. whatever Finder.instance returns.
  def self.finder
    Finder.instance
  end

  # Forwards every argument to the shared finder's #find and returns its
  # result. Keyword options are passed on as keywords (see .parse).
  def self.find(*arguments, **options)
    finder.find(*arguments, **options)
  end
end
@@ -0,0 +1,132 @@
module AnyStyle
  # Lookup table that maps a term to an Integer bit mask recording under
  # which of the categories in Dictionary.tags the term was listed.
  #
  # This base class keeps the table in a plain Hash; Dictionary.create picks
  # one of the other adapters depending on the :adapter option.
  class Dictionary
    @tags = [:name, :place, :publisher, :journal]

    # Every tag owns one bit, in declaration order:
    # name => 1, place => 2, publisher => 4, journal => 8.
    @code = Hash[@tags.each_with_index.map { |tag, bit| [tag, 1 << bit] }]

    @tags.freeze
    @code.freeze

    @defaults = {
      adapter: :ruby,
      source: nil
    }

    class << self
      attr_reader :tags, :code, :defaults, :adapters

      # Builds a dictionary for the adapter named in options[:adapter];
      # missing options are filled in from Dictionary.defaults.
      #
      # options - a Hash of options, nil, or an existing Dictionary (which
      #           is returned untouched).
      #
      # Returns a new, not yet opened dictionary.
      # Raises ArgumentError if the adapter is nil or unknown.
      def create(options = {})
        return options if options.is_a?(Dictionary)

        options = defaults.merge(options || {})
        adapter = options.delete :adapter

        # Going through to_s lets a nil adapter fall into the ArgumentError
        # branch below instead of failing with NoMethodError on nil.to_sym.
        case adapter.to_s.to_sym
        when :memory, :hash
          new options
        when :gdbm
          require 'anystyle/dictionary/gdbm'
          Dictionary::GDBM.new options
        when :lmdb
          require 'anystyle/dictionary/lmdb'
          Dictionary::LMDB.new options
        when :redis
          require 'anystyle/dictionary/redis'
          Dictionary::Redis.new options
        when :marshal, :ruby
          require 'anystyle/dictionary/marshal'
          Dictionary::Marshal.new options
        else
          raise ArgumentError, "unknown adapter: #{adapter}"
        end
      end

      # Returns the opened default dictionary, created on first use and
      # memoized in Thread.current['anystyle_dictionary'].
      def instance
        Thread.current['anystyle_dictionary'] ||= create.open
      end
    end

    attr_reader :db, :options

    # options - the Hash of adapter options; #populate! reads
    #           options[:source] as the path of the gzipped word list.
    def initialize(options)
      @options = options
    end

    # Creates the in-memory table if necessary and fills it from the source
    # file while it is still empty. Returns self.
    def open
      @db = {} unless open?
      populate! if empty?
      self
    end

    # Drops the in-memory table.
    def close
      @db = nil
    end

    # For the in-memory table truncating is the same as closing.
    def truncate
      close
    end

    def open?
      not db.nil?
    end

    # True only for an open dictionary without entries; a closed dictionary
    # reports false instead of raising on the missing table.
    def empty?
      open? && db.empty?
    end

    # Returns the Integer bit mask stored for key (0 if there is none).
    def get(key)
      db[key.to_s].to_i
    end

    # Stores value, converted to Integer, under the String form of key.
    def put(key, value)
      db[key.to_s] = value.to_i
    end

    alias_method :[], :get
    alias_method :[]=, :put

    # Returns one 'T' or 'F' per entry of Dictionary.tags, telling whether
    # the corresponding bit is set for key.
    def tags(key)
      value = get key

      Dictionary.tags.map { |tag|
        (value & Dictionary.code[tag]) > 0 ? 'T' : 'F'
      }
    end

    # Returns an Array, parallel to Dictionary.tags, counting how many of
    # the given keys have each tag's bit set.
    def tag_counts(keys)
      counts = Dictionary.tags.map { 0 }

      keys.each do |key|
        value = get(key)
        next unless value > 0

        Dictionary.tags.each_with_index do |tag, idx|
          counts[idx] += 1 if (value & Dictionary.code[tag]) > 0
        end
      end

      counts
    end

    # Reads the gzipped, UTF-8 encoded word list at options[:source] and
    # ORs the bit of the current section into every term listed there.
    #
    # Lines starting with "#! <tag>" switch the section, other lines
    # starting with "#" are comments and blank lines are ignored. On all
    # remaining lines a trailing decimal number is stripped off the key.
    def populate!
      require 'zlib'

      File.open(options[:source], 'rb') do |file|
        mode = 0

        # The block form of wrap closes the reader when done, even on errors.
        Zlib::GzipReader.wrap(file, encoding: 'UTF-8') do |gz|
          gz.each_line do |line|
            line = line.strip

            case line
            when ''
              # skip blank lines, they would end up stored under the key ''
            when /^#! (\w+)/i
              # Unknown sections fall back to 0 (no tag) so that the bitwise
              # OR below never sees nil; tag names match case-insensitively.
              mode = Dictionary.code[$1.downcase.to_sym] || 0
            when /^#/
              # skip comments
            else
              key = line.split(/\s+(\d+\.\d+)\s*$/)[0]
              put key, get(key) | mode
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
module AnyStyle
  require 'gdbm'

  class Dictionary
    # Dictionary adapter that keeps the table in a GDBM database file
    # instead of an in-memory Hash. GDBM stores Strings, so the bit masks
    # are written as decimal Strings and converted back on lookup.
    class GDBM < Dictionary
      @defaults = {
        path: File.expand_path('../../data/dict.db', __FILE__),
        mode: 0666,
        flags: ::GDBM::WRCREAT | ::GDBM::NOLOCK
      }

      # NOTE(review): @env is never assigned in this class, so this reader
      # returns nil here; kept only to leave the public interface unchanged.
      attr_reader :env

      # options - a Hash merged over this class's defaults
      #           (:path, :mode and :flags).
      def initialize(options = {})
        super(self.class.defaults.merge(options))
      end

      # Closes any handle that is still open, re-opens the database file
      # using the :path, :mode and :flags options and fills it from the
      # source while it is empty. Returns self.
      def open
        close
        @db = ::GDBM.new(*options.values_at(:path, :mode, :flags))
        populate! if empty?
        self
      end

      # Closes the database handle if there is an open one.
      def close
        db.close if open?
      end

      def open?
        !(db.nil? || db.closed?)
      end

      # True only for an open database without entries.
      def empty?
        open? && db.empty?
      end

      # Closes the database and deletes its file. A file that does not
      # exist (e.g. when truncating twice) is not treated as an error.
      def truncate
        close
        path = options[:path]
        File.unlink(path) if File.exist?(path)
      end

      # Returns the Integer bit mask stored for key (0 if there is none).
      def get(key)
        db[key.to_s].to_i
      end

      # Stores value as a decimal String under the String form of key.
      def put(key, value)
        db[key.to_s] = value.to_i.to_s
      end
    end
  end
end