anystyle-cli 1.0.1 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db54794068efeafdbcb793c71ffeff40fa3e67f5ac7e4564eb76c2cddc23ec09
4
- data.tar.gz: 4db1f6bf5a8529cd9dbf736745401a83ef207642c595c16857d903468488ddb6
3
+ metadata.gz: 25fa9822dfe3ab9862a84f5c31ae70df159643a3ecc40c4958ef0ed77b17da54
4
+ data.tar.gz: cf7587eec67a7372df802f4a7ee63e4ef1bc915070137c56d424cb2fcb1e813d
5
5
  SHA512:
6
- metadata.gz: bda0fd4dd07c77a189a8964e4ec150d63a2ff80b324f5ea3e28188e6c1ce2947ca007df3fc5142dc84bc05b17a0cb0cfc960908c78532ec8723f940521ee900a
7
- data.tar.gz: baecc0a27805e48906a23f9f08ddf1af1e22c83015993cef4624151778a5bdee3ad80bc50b6437d65f5a753c0f62f229463d8a8c162ff0faec44f4d6f206dbe9
6
+ metadata.gz: f1a0b7ec3479c88d429fc1b58855473fefc0cdfac9171a72c2124e3fb2e75b3503caf9963d072c74da5a55673af69ca1db9cc751dcf1ece6ae58d997a5837700
7
+ data.tar.gz: c8c410d03fe9dc43a1667d77f0931fd96d4f004ab9a1d45b21bf4eacad68054f3a301ea0241a42a42affd77dda7fbef24f042e1d3a4e4dc702da4b261a306dd5
data/README.md CHANGED
@@ -10,22 +10,28 @@ anystyle --help
10
10
  anystyle [global options] command [command options] [arguments...]
11
11
 
12
12
  VERSION
13
- 1.0.0 (cli 1.0.0, data 1.2.0)
13
+ 1.1.0 (cli 1.0.2, data 1.2.0)
14
14
 
15
15
  GLOBAL OPTIONS
16
- --adapter=name - Set the dictionary adapter (default: ruby)
17
- -f, --format=name - Set the output format (default: ["json"])
18
- --help - Show this message
19
- --[no-]stdout - Print results directly to stdout
20
- --[no-]verbose - Print status messages to stderr
21
- --version - Display the program version
22
- -w, --[no-]overwrite - Allow overwriting existing files
16
+ -F, --finder-model=file - Set the finder model file (default: none)
17
+ -P, --parser-model=file - Set the parser model file (default: none)
18
+ --adapter=name - Set the dictionary adapter (default: ruby)
19
+ -f, --format=name - Set the output format (default: ["json"])
20
+ --pdfinfo=path - Set the path for pdfinfo (default: none)
21
+ --pdftotext=path - Set the path for pdftotext (default: none)
22
+ --help - Show this message
23
+ --[no-]stdout - Print results directly to stdout
24
+ --[no-]verbose - Print status messages to stderr
25
+ --version - Display the program version
26
+ -w, --[no-]overwrite - Allow overwriting existing files
23
27
 
24
28
  COMMANDS
29
+ check - Check tagged documents or references
25
30
  find - Find and extract references from text documents
26
31
  help - Shows a list of commands or help for one command
27
32
  license - Print license information
28
33
  parse - Parse and convert references
34
+ train - Create a new finder or parser model
29
35
 
30
36
  anystyle help find
31
37
  ------------------
@@ -57,7 +63,6 @@ anystyle help find
57
63
 
58
64
  Analyzing PDF documents currently depends on `pdftotext' which must be
59
65
  installed separately.
60
-
61
66
  EXAMPLES
62
67
  anystyle -f csl,xml find thesis.pdf
63
68
 
@@ -70,6 +75,24 @@ anystyle help find
70
75
  if your document uses a multi-column layout) and save them in BibTeX in
71
76
  `./bib/thesis.bib'.
72
77
 
78
+ anystyle find --crop 72 thesis.pdf -
79
+
80
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
81
+ each page border and print the results to STDOUT.
82
+
83
+ anystyle find --crop 72,28 thesis.pdf -
84
+
85
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
86
+ each page's left and right border, approx. 1cm (28pt) from the top
87
+ and bottom.
88
+
89
+
90
+ COMMAND OPTIONS
91
+ -C, --crop=pt - Set cropping boundary for text extraction (default: none)
92
+ --[no-]layout - Use layout mode for PDF text extraction (default: enabled)
93
+ --[no-]solo - Include references outside of reference sections
94
+
95
+
73
96
  anystyle help parse
74
97
  -------------------
75
98
  COMMAND OPTIONS
@@ -111,6 +134,55 @@ anystyle help parse
111
134
 
112
135
  Extract references from `input.txt' and print them to STDOUT in CSL/JSON.
113
136
 
137
+
138
+ anystyle help check
139
+ -------------------
140
+ NAME
141
+ check - Check tagged documents or references
142
+
143
+ SYNOPSIS
144
+ anystyle [global options] check input
145
+
146
+ DESCRIPTION
147
+ This manual page documents the AnyStyle `check' command. AnyStyle `check'
148
+ analyzes tagged text documents or references.
149
+
150
+ The input argument can be a single TTX or XML document, or a folder
151
+ containing multiple documents.
152
+
153
+ AnyStyle `check' supports the following input formats:
154
+ ttx Tagged document format, used for training the finder model;
155
+ xml References only, XML, suitable for training the parser model.
156
+
157
+ EXAMPLES
158
+ anystyle check training-data.xml
159
+
160
+ Checks all references in the XML file and prints a report to STDOUT.
161
+
162
+
163
+ anystyle help train
164
+ -------------------
165
+ NAME
166
+ train - Create a new finder or parser model
167
+
168
+ SYNOPSIS
169
+ anystyle [global options] train input [output]
170
+
171
+ DESCRIPTION
172
+ This manual page documents the AnyStyle `train' command. AnyStyle `train'
173
+ creates a new finder or parser model based on the supplied training sets.
174
+
175
+ The input argument can be a XML document, or a folder containing multiple
176
+ TTX documents.
177
+
178
+ EXAMPLES
179
+ anystyle train data.xml my-model.mod
180
+
181
+ Creates a new parser model based on the XML training set and saves it
182
+ as `my-model.mod'. To use your model use the global `--finder-model'
183
+ or `--parser-model' flags.
184
+
185
+
114
186
  License
115
187
  -------
116
188
  Copyright 2011-2018 Sylvester Keil. All rights reserved.
@@ -30,6 +30,23 @@ switch ['w', 'overwrite'],
30
30
  switch 'stdout',
31
31
  desc: 'Print results directly to stdout'
32
32
 
33
+ flag ['F', 'finder-model'],
34
+ arg_name: 'file',
35
+ desc: 'Set the finder model file'
36
+
37
+ flag ['P', 'parser-model'],
38
+ arg_name: 'file',
39
+ desc: 'Set the parser model file'
40
+
41
+ flag 'pdftotext',
42
+ arg_name: 'path',
43
+ desc: 'Set the path for pdftotext'
44
+
45
+ flag 'pdfinfo',
46
+ arg_name: 'path',
47
+ desc: 'Set the path for pdfinfo'
48
+
49
+
33
50
  flag 'adapter',
34
51
  default_value: 'ruby',
35
52
  arg_name: 'name',
@@ -46,6 +63,28 @@ flag ['f', 'format'],
46
63
 
47
64
  pre do |opts|
48
65
  AnyStyle::Dictionary.defaults[:adapter] = opts[:adapter]
66
+
67
+ unless opts[:'finder-model'].nil?
68
+ AnyStyle::Finder.defaults[:model] =
69
+ File.expand_path(opts[:'finder-model']).untaint
70
+ end
71
+
72
+ unless opts[:'parser-model'].nil?
73
+ AnyStyle::Parser.defaults[:model] =
74
+ File.expand_path(opts[:'parser-model']).untaint
75
+ end
76
+
77
+ unless opts[:pdftotext].nil?
78
+ AnyStyle::Finder.defaults[:pdftotext] =
79
+ opts[:pdftotext].untaint
80
+ end
81
+
82
+ unless opts[:pdfinfo].nil?
83
+ AnyStyle::Finder.defaults[:pdfinfo] =
84
+ opts[:pdfinfo].untaint
85
+ end
86
+
87
+ AnyStyle
49
88
  end
50
89
 
51
90
 
@@ -84,6 +123,17 @@ EXAMPLES
84
123
  Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this
85
124
  if your document uses a multi-column layout) and save them in BibTeX in
86
125
  `./bib/thesis.bib'.
126
+
127
+ anystyle find --crop 72 thesis.pdf -
128
+
129
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
130
+ each page border and print the results to STDOUT.
131
+
132
+ anystyle find --crop 72,28 thesis.pdf -
133
+
134
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
135
+ each page's left and right border, approx. 1cm (28pt) from the top
136
+ and bottom.
87
137
  }.lstrip
88
138
 
89
139
  arg :input
@@ -93,6 +143,16 @@ command :find do |cmd|
93
143
  default_value: true,
94
144
  desc: 'Use layout mode for PDF text extraction'
95
145
 
146
+ cmd.switch 'solo',
147
+ default_value: false,
148
+ desc: 'Include references outside of reference sections'
149
+
150
+ cmd.flag ['C', 'crop'],
151
+ arg_name: 'pt',
152
+ type: Array,
153
+ must_match: /\d+(,\d+)?|\d+,\d+(,-?\d+){2}/,
154
+ desc: 'Set cropping boundary for text extraction'
155
+
96
156
  cmd.action do |opts, params, args|
97
157
  Commands::Find.new(opts).run(args, params)
98
158
  end
@@ -140,6 +200,56 @@ command :parse do |cmd|
140
200
  end
141
201
  end
142
202
 
203
+ desc 'Check tagged documents or references'
204
+ long_desc %{
205
+ This manual page documents the AnyStyle `check' command. AnyStyle `check'
206
+ analyzes tagged text documents or references.
207
+
208
+ The input argument can be a single TTX or XML document, or a folder
209
+ containing multiple documents.
210
+
211
+ AnyStyle `check' supports the following input formats:
212
+ ttx Tagged document format, used for training the finder model;
213
+ xml References only, XML, suitable for training the parser model.
214
+
215
+ EXAMPLES
216
+ anystyle check training-data.xml
217
+
218
+ Checks all references in the XML file and prints a report to STDOUT.
219
+ }.lstrip
220
+
221
+ arg :input
222
+ command :check do |cmd|
223
+ cmd.action do |opts, params, args|
224
+ Commands::Check.new(opts).run(args, params)
225
+ end
226
+ end
227
+
228
+
229
+ desc 'Create a new finder or parser model'
230
+ long_desc %{
231
+ This manual page documents the AnyStyle `train' command. AnyStyle `train'
232
+ creates a new finder or parser model based on the supplied training sets.
233
+
234
+ The input argument can be an XML document, or a folder containing multiple
235
+ TTX documents.
236
+
237
+ EXAMPLES
238
+ anystyle train data.xml my-model.mod
239
+
240
+ Creates a new parser model based on the XML training set and saves it
241
+ as `my-model.mod'. To use your model use the global `--finder-model'
242
+ or `--parser-model' flags.
243
+ }.lstrip
244
+
245
+ arg :input
246
+ arg :output, :optional
247
+ command :train do |cmd|
248
+ cmd.action do |opts, params, args|
249
+ Commands::Train.new(opts).run(args, params)
250
+ end
251
+ end
252
+
143
253
  desc 'Print license information'
144
254
  command :license do |cmd|
145
255
  cmd.action do
@@ -4,5 +4,7 @@ require 'anystyle'
4
4
 
5
5
  require 'anystyle/cli/version'
6
6
  require 'anystyle/cli/commands/base'
7
+ require 'anystyle/cli/commands/check'
7
8
  require 'anystyle/cli/commands/find'
8
9
  require 'anystyle/cli/commands/parse'
10
+ require 'anystyle/cli/commands/train'
@@ -28,8 +28,11 @@ module AnyStyle
28
28
  options[:format].each(&block)
29
29
  end
30
30
 
31
- def find(input, **opts)
32
- AnyStyle.find(input, format: :wapiti, **opts)
31
+ def find(input, opts = {})
32
+ AnyStyle.find(input,
33
+ format: :wapiti,
34
+ layout: opts[:layout],
35
+ crop: opts[:crop].nil? ? nil : opts[:crop].map(&:to_i))
33
36
  end
34
37
 
35
38
  def parse(input)
@@ -67,7 +70,12 @@ module AnyStyle
67
70
  end
68
71
 
69
72
  def set_output_folder(path)
70
- @output_folder = Pathname.new(path).expand_path unless path.nil?
73
+ case path
74
+ when nil, '-'
75
+ options[:stdout] = true
76
+ else
77
+ @output_folder = Pathname.new(path).expand_path
78
+ end
71
79
  ensure
72
80
  unless @output_folder.nil?
73
81
  if @output_folder.exist?
@@ -83,16 +91,32 @@ module AnyStyle
83
91
  STDERR.puts(*args) if verbose?
84
92
  end
85
93
 
94
+ def report(error, file)
95
+ STDERR.puts "Error processing `#{file}'"
96
+ STDERR.puts " #{error.message}"
97
+ STDERR.puts " #{error.backtrace[0]}"
98
+ STDERR.puts " #{error.backtrace[1]}"
99
+ STDERR.puts " ..."
100
+ end
101
+
86
102
  def walk(input)
87
103
  path = Pathname(input).expand_path
88
104
  raise ArgumentError, "path does not exist: #{input}" unless path.exist?
89
105
 
90
106
  if path.directory?
91
107
  path.each_child do |file|
92
- yield file, path unless file.directory?
108
+ begin
109
+ yield file, path unless file.directory?
110
+ rescue => e
111
+ report e, file.relative_path_from(path)
112
+ end
93
113
  end
94
114
  else
95
- yield path, path.dirname
115
+ begin
116
+ yield path, path.dirname
117
+ rescue => e
118
+ report e, path.basename
119
+ end
96
120
  end
97
121
  end
98
122
 
@@ -0,0 +1,41 @@
1
+ module AnyStyle
2
+ module CLI
3
+ module Commands
4
+ class Check < Base
5
+ def run(args, params)
6
+ walk args[0] do |path|
7
+ print 'Checking %.25s' % "#{path.basename}....................."
8
+ start = Time.now
9
+ stats = check path
10
+ report stats, Time.now - start
11
+ end
12
+ end
13
+
14
+ def check(path)
15
+ case path.extname
16
+ when '.ttx'
17
+ AnyStyle.finder.check path.to_s.untaint
18
+ when '.xml'
19
+ AnyStyle.parser.check path.to_s.untaint
20
+ else
21
+ raise ArgumentError, "cannot check untagged input: #{path}"
22
+ end
23
+ end
24
+
25
+ def report(stats, time)
26
+ if stats[:token][:errors] == 0
27
+ puts ' ✓ %2ds' % time
28
+ else
29
+ puts '%4d seq %6.2f%% %6d tok %5.2f%% %2ds' % [
30
+ stats[:sequence][:errors],
31
+ stats[:sequence][:rate],
32
+ stats[:token][:errors],
33
+ stats[:token][:rate],
34
+ time
35
+ ]
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
@@ -6,8 +6,8 @@ module AnyStyle
6
6
  set_output_folder args[1]
7
7
  walk args[0] do |path, base_path|
8
8
  say "Analyzing #{path.relative_path_from(base_path)} ..."
9
- doc = find(path.to_s.untaint, layout: params[:layout])
10
- ref = doc[0].references
9
+ doc = find(path.to_s.untaint, params)
10
+ ref = doc[0].references(normalize_blocks: !params[:solo])
11
11
 
12
12
  if ref.length == 0
13
13
  say "no references found."
@@ -17,9 +17,9 @@ module AnyStyle
17
17
  each_format do |fmt|
18
18
  case fmt
19
19
  when 'ttx'
20
- res = doc.to_txt tagged: true
20
+ res = doc.to_s tagged: true
21
21
  when 'txt'
22
- res = doc.to_txt tagged: false
22
+ res = doc.to_s tagged: false
23
23
  when 'ref'
24
24
  res = ref.join("\n")
25
25
  else
@@ -0,0 +1,41 @@
1
+ module AnyStyle
2
+ module CLI
3
+ module Commands
4
+ class Train < Base
5
+ def run(args, params)
6
+ check_no_overwrite! args[1]
7
+
8
+ Wapiti.debug!
9
+ model = train(args[0])
10
+
11
+ if args[1].nil?
12
+ model.save
13
+ else
14
+ model.save File.expand_path(args[1]).untaint
15
+ end
16
+ end
17
+
18
+ def train(path)
19
+ case
20
+ when File.extname(path) == '.xml'
21
+ AnyStyle.parser.train path.to_s.untaint
22
+ AnyStyle.parser.model
23
+ when File.directory?(path)
24
+ AnyStyle.finder.train Dir[File.join(path, '*.ttx')].map(&:untaint)
25
+ AnyStyle.finder.model
26
+ else
27
+ raise ArgumentError, "cannot train input: #{path}"
28
+ end
29
+ end
30
+
31
+ def check_no_overwrite!(path)
32
+ if !overwrite? && (path.nil? || File.exist?(path))
33
+ raise RuntimeError,
34
+ "file exists, use --overwrite to force saving: #{path}"
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+
@@ -1,5 +1,5 @@
1
1
  module AnyStyle
2
2
  module CLI
3
- VERSION = '1.0.1'.freeze
3
+ VERSION = '1.3.1'.freeze
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvester Keil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-06-06 00:00:00.000000000 Z
11
+ date: 2020-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: anystyle
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bibtex-ruby
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '4.4'
33
+ version: '5.1'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '4.4'
40
+ version: '5.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: gli
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -65,8 +65,10 @@ files:
65
65
  - bin/anystyle
66
66
  - lib/anystyle/cli.rb
67
67
  - lib/anystyle/cli/commands/base.rb
68
+ - lib/anystyle/cli/commands/check.rb
68
69
  - lib/anystyle/cli/commands/find.rb
69
70
  - lib/anystyle/cli/commands/parse.rb
71
+ - lib/anystyle/cli/commands/train.rb
70
72
  - lib/anystyle/cli/version.rb
71
73
  homepage: http://anystyle.io
72
74
  licenses:
@@ -87,8 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
89
  - !ruby/object:Gem::Version
88
90
  version: '0'
89
91
  requirements: []
90
- rubyforge_project:
91
- rubygems_version: 2.7.4
92
+ rubygems_version: 3.1.2
92
93
  signing_key:
93
94
  specification_version: 4
94
95
  summary: AnyStyle CLI