anystyle-cli 1.0.1 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db54794068efeafdbcb793c71ffeff40fa3e67f5ac7e4564eb76c2cddc23ec09
4
- data.tar.gz: 4db1f6bf5a8529cd9dbf736745401a83ef207642c595c16857d903468488ddb6
3
+ metadata.gz: 25fa9822dfe3ab9862a84f5c31ae70df159643a3ecc40c4958ef0ed77b17da54
4
+ data.tar.gz: cf7587eec67a7372df802f4a7ee63e4ef1bc915070137c56d424cb2fcb1e813d
5
5
  SHA512:
6
- metadata.gz: bda0fd4dd07c77a189a8964e4ec150d63a2ff80b324f5ea3e28188e6c1ce2947ca007df3fc5142dc84bc05b17a0cb0cfc960908c78532ec8723f940521ee900a
7
- data.tar.gz: baecc0a27805e48906a23f9f08ddf1af1e22c83015993cef4624151778a5bdee3ad80bc50b6437d65f5a753c0f62f229463d8a8c162ff0faec44f4d6f206dbe9
6
+ metadata.gz: f1a0b7ec3479c88d429fc1b58855473fefc0cdfac9171a72c2124e3fb2e75b3503caf9963d072c74da5a55673af69ca1db9cc751dcf1ece6ae58d997a5837700
7
+ data.tar.gz: c8c410d03fe9dc43a1667d77f0931fd96d4f004ab9a1d45b21bf4eacad68054f3a301ea0241a42a42affd77dda7fbef24f042e1d3a4e4dc702da4b261a306dd5
data/README.md CHANGED
@@ -10,22 +10,28 @@ anystyle --help
10
10
  anystyle [global options] command [command options] [arguments...]
11
11
 
12
12
  VERSION
13
- 1.0.0 (cli 1.0.0, data 1.2.0)
13
+ 1.1.0 (cli 1.0.2, data 1.2.0)
14
14
 
15
15
  GLOBAL OPTIONS
16
- --adapter=name - Set the dictionary adapter (default: ruby)
17
- -f, --format=name - Set the output format (default: ["json"])
18
- --help - Show this message
19
- --[no-]stdout - Print results directly to stdout
20
- --[no-]verbose - Print status messages to stderr
21
- --version - Display the program version
22
- -w, --[no-]overwrite - Allow overwriting existing files
16
+ -F, --finder-model=file - Set the finder model file (default: none)
17
+ -P, --parser-model=file - Set the parser model file (default: none)
18
+ --adapter=name - Set the dictionary adapter (default: ruby)
19
+ -f, --format=name - Set the output format (default: ["json"])
20
+ --pdfinfo=path - Set the path for pdfinfo (default: none)
21
+ --pdftotext=path - Set the path for pdftotext (default: none)
22
+ --help - Show this message
23
+ --[no-]stdout - Print results directly to stdout
24
+ --[no-]verbose - Print status messages to stderr
25
+ --version - Display the program version
26
+ -w, --[no-]overwrite - Allow overwriting existing files
23
27
 
24
28
  COMMANDS
29
+ check - Check tagged documents or references
25
30
  find - Find and extract references from text documents
26
31
  help - Shows a list of commands or help for one command
27
32
  license - Print license information
28
33
  parse - Parse and convert references
34
+ train - Create a new finder or parser model
29
35
 
30
36
  anystyle help find
31
37
  ------------------
@@ -57,7 +63,6 @@ anystyle help find
57
63
 
58
64
  Analyzing PDF documents currently depends on `pdftotext' which must be
59
65
  installed separately.
60
-
61
66
  EXAMPLES
62
67
  anystyle -f csl,xml find thesis.pdf
63
68
 
@@ -70,6 +75,24 @@ anystyle help find
70
75
  if your document uses a multi-column layout) and save them in BibTeX in
71
76
  `./bib/thesis.bib'.
72
77
 
78
+ anystyle find --crop 72 thesis.pdf -
79
+
80
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
81
+ each page border and print the results to STDOUT.
82
+
83
+ anystyle find --crop 72,28 thesis.pdf -
84
+
85
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
86
+ each page's left and right border, approx. 1cm (28pt) from the top
87
+ and bottom.
88
+
89
+
90
+ COMMAND OPTIONS
91
+ -C, --crop=pt - Set cropping boundary for text extraction (default: none)
92
+ --[no-]layout - Use layout mode for PDF text extraction (default: enabled)
93
+ --[no-]solo - Include references outside of reference sections
94
+
95
+
73
96
  anystyle help parse
74
97
  -------------------
75
98
  COMMAND OPTIONS
@@ -111,6 +134,55 @@ anystyle help parse
111
134
 
112
135
  Extract references from `input.txt' and print them to STDOUT in CSL/JSON.
113
136
 
137
+
138
+ anystyle help check
139
+ -------------------
140
+ NAME
141
+ check - Check tagged documents or references
142
+
143
+ SYNOPSIS
144
+ anystyle [global options] check input
145
+
146
+ DESCRIPTION
147
+ This manual page documents the AnyStyle `check' command. AnyStyle `check'
148
+ analyzes tagged text documents or references.
149
+
150
+ The input argument can be a single TTX or XML document, or a folder
151
+ containing multiple documents.
152
+
153
+ AnyStyle `check' supports the following input formats:
154
+ ttx Tagged document format, used for training the finder model;
155
+ xml References only, XML, suitable for training the parser model.
156
+
157
+ EXAMPLES
158
+ anystyle check training-data.xml
159
+
160
+ Checks all references in the XML file and prints a report to STDOUT.
161
+
162
+
163
+ anystyle help train
164
+ -------------------
165
+ NAME
166
+ train - Create a new finder or parser model
167
+
168
+ SYNOPSIS
169
+ anystyle [global options] train input [output]
170
+
171
+ DESCRIPTION
172
+ This manual page documents the AnyStyle `train' command. AnyStyle `train'
173
+ creates a new finder or parser model based on the supplied training sets.
174
+
175
+ The input argument can be a XML document, or a folder containing multiple
176
+ TTX documents.
177
+
178
+ EXAMPLES
179
+ anystyle train data.xml my-model.mod
180
+
181
+ Creates a new parser model based on the XML training set and saves it
182
+ as `my-model.mod'. To use your model use the global `--finder-model'
183
+ or `--parser-model' flags.
184
+
185
+
114
186
  License
115
187
  -------
116
188
  Copyright 2011-2018 Sylvester Keil. All rights reserved.
@@ -30,6 +30,23 @@ switch ['w', 'overwrite'],
30
30
  switch 'stdout',
31
31
  desc: 'Print results directly to stdout'
32
32
 
33
+ flag ['F', 'finder-model'],
34
+ arg_name: 'file',
35
+ desc: 'Set the finder model file'
36
+
37
+ flag ['P', 'parser-model'],
38
+ arg_name: 'file',
39
+ desc: 'Set the parser model file'
40
+
41
+ flag 'pdftotext',
42
+ arg_name: 'path',
43
+ desc: 'Set the path for pdftotext'
44
+
45
+ flag 'pdfinfo',
46
+ arg_name: 'path',
47
+ desc: 'Set the path for pdfinfo'
48
+
49
+
33
50
  flag 'adapter',
34
51
  default_value: 'ruby',
35
52
  arg_name: 'name',
@@ -46,6 +63,28 @@ flag ['f', 'format'],
46
63
 
47
64
  pre do |opts|
48
65
  AnyStyle::Dictionary.defaults[:adapter] = opts[:adapter]
66
+
67
+ unless opts[:'finder-model'].nil?
68
+ AnyStyle::Finder.defaults[:model] =
69
+ File.expand_path(opts[:'finder-model']).untaint
70
+ end
71
+
72
+ unless opts[:'parser-model'].nil?
73
+ AnyStyle::Parser.defaults[:model] =
74
+ File.expand_path(opts[:'parser-model']).untaint
75
+ end
76
+
77
+ unless opts[:pdftotext].nil?
78
+ AnyStyle::Finder.defaults[:pdftotext] =
79
+ opts[:pdftotext].untaint
80
+ end
81
+
82
+ unless opts[:pdfinfo].nil?
83
+ AnyStyle::Finder.defaults[:pdfinfo] =
84
+ opts[:pdfinfo].untaint
85
+ end
86
+
87
+ AnyStyle
49
88
  end
50
89
 
51
90
 
@@ -84,6 +123,17 @@ EXAMPLES
84
123
  Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this
85
124
  if your document uses a multi-column layout) and save them in BibTeX in
86
125
  `./bib/thesis.bib'.
126
+
127
+ anystyle find --crop 72 thesis.pdf -
128
+
129
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
130
+ each page border and print the results to STDOUT.
131
+
132
+ anystyle find --crop 72,28 thesis.pdf -
133
+
134
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
135
+ each page's left and right border, approx. 1cm (28pt) from the top
136
+ and bottom.
87
137
  }.lstrip
88
138
 
89
139
  arg :input
@@ -93,6 +143,16 @@ command :find do |cmd|
93
143
  default_value: true,
94
144
  desc: 'Use layout mode for PDF text extraction'
95
145
 
146
+ cmd.switch 'solo',
147
+ default_value: false,
148
+ desc: 'Include references outside of reference sections'
149
+
150
+ cmd.flag ['C', 'crop'],
151
+ arg_name: 'pt',
152
+ type: Array,
153
+ must_match: /\d+(,\d+)?|\d+,\d+(,-?\d+){2}/,
154
+ desc: 'Set cropping boundary for text extraction'
155
+
96
156
  cmd.action do |opts, params, args|
97
157
  Commands::Find.new(opts).run(args, params)
98
158
  end
@@ -140,6 +200,56 @@ command :parse do |cmd|
140
200
  end
141
201
  end
142
202
 
203
+ desc 'Check tagged documents or references'
204
+ long_desc %{
205
+ This manual page documents the AnyStyle `check' command. AnyStyle `check'
206
+ analyzes tagged text documents or references.
207
+
208
+ The input argument can be a single TTX or XML document, or a folder
209
+ containing multiple documents.
210
+
211
+ AnyStyle `check' supports the following input formats:
212
+ ttx Tagged document format, used for training the finder model;
213
+ xml References only, XML, suitable for training the parser model.
214
+
215
+ EXAMPLES
216
+ anystyle check training-data.xml
217
+
218
+ Checks all references in the XML file and prints a report to STDOUT.
219
+ }.lstrip
220
+
221
+ arg :input
222
+ command :check do |cmd|
223
+ cmd.action do |opts, params, args|
224
+ Commands::Check.new(opts).run(args, params)
225
+ end
226
+ end
227
+
228
+
229
+ desc 'Create a new finder or parser model'
230
+ long_desc %{
231
+ This manual page documents the AnyStyle `train' command. AnyStyle `train'
232
+ creates a new finder or parser model based on the supplied training sets.
233
+
234
+ The input argument can be a XML document, or a folder containing multiple
235
+ TTX documents.
236
+
237
+ EXAMPLES
238
+ anystyle train data.xml my-model.mod
239
+
240
+ Creates a new parser model based on the XML training set and saves it
241
+ as `my-model.mod'. To use your model use the global `--finder-model'
242
+ or `--parser-model' flags.
243
+ }.lstrip
244
+
245
+ arg :input
246
+ arg :output, :optional
247
+ command :train do |cmd|
248
+ cmd.action do |opts, params, args|
249
+ Commands::Train.new(opts).run(args, params)
250
+ end
251
+ end
252
+
143
253
  desc 'Print license information'
144
254
  command :license do |cmd|
145
255
  cmd.action do
@@ -4,5 +4,7 @@ require 'anystyle'
4
4
 
5
5
  require 'anystyle/cli/version'
6
6
  require 'anystyle/cli/commands/base'
7
+ require 'anystyle/cli/commands/check'
7
8
  require 'anystyle/cli/commands/find'
8
9
  require 'anystyle/cli/commands/parse'
10
+ require 'anystyle/cli/commands/train'
@@ -28,8 +28,11 @@ module AnyStyle
28
28
  options[:format].each(&block)
29
29
  end
30
30
 
31
- def find(input, **opts)
32
- AnyStyle.find(input, format: :wapiti, **opts)
31
+ def find(input, opts = {})
32
+ AnyStyle.find(input,
33
+ format: :wapiti,
34
+ layout: opts[:layout],
35
+ crop: opts[:crop].nil? ? nil : opts[:crop].map(&:to_i))
33
36
  end
34
37
 
35
38
  def parse(input)
@@ -67,7 +70,12 @@ module AnyStyle
67
70
  end
68
71
 
69
72
  def set_output_folder(path)
70
- @output_folder = Pathname.new(path).expand_path unless path.nil?
73
+ case path
74
+ when nil, '-'
75
+ options[:stdout] = true
76
+ else
77
+ @output_folder = Pathname.new(path).expand_path
78
+ end
71
79
  ensure
72
80
  unless @output_folder.nil?
73
81
  if @output_folder.exist?
@@ -83,16 +91,32 @@ module AnyStyle
83
91
  STDERR.puts(*args) if verbose?
84
92
  end
85
93
 
94
+ def report(error, file)
95
+ STDERR.puts "Error processing `#{file}'"
96
+ STDERR.puts " #{error.message}"
97
+ STDERR.puts " #{error.backtrace[0]}"
98
+ STDERR.puts " #{error.backtrace[1]}"
99
+ STDERR.puts " ..."
100
+ end
101
+
86
102
  def walk(input)
87
103
  path = Pathname(input).expand_path
88
104
  raise ArgumentError, "path does not exist: #{input}" unless path.exist?
89
105
 
90
106
  if path.directory?
91
107
  path.each_child do |file|
92
- yield file, path unless file.directory?
108
+ begin
109
+ yield file, path unless file.directory?
110
+ rescue => e
111
+ report e, file.relative_path_from(path)
112
+ end
93
113
  end
94
114
  else
95
- yield path, path.dirname
115
+ begin
116
+ yield path, path.dirname
117
+ rescue => e
118
+ report e, path.basename
119
+ end
96
120
  end
97
121
  end
98
122
 
@@ -0,0 +1,41 @@
1
+ module AnyStyle
2
+ module CLI
3
+ module Commands
4
+ class Check < Base
5
+ def run(args, params)
6
+ walk args[0] do |path|
7
+ print 'Checking %.25s' % "#{path.basename}....................."
8
+ start = Time.now
9
+ stats = check path
10
+ report stats, Time.now - start
11
+ end
12
+ end
13
+
14
+ def check(path)
15
+ case path.extname
16
+ when '.ttx'
17
+ AnyStyle.finder.check path.to_s.untaint
18
+ when '.xml'
19
+ AnyStyle.parser.check path.to_s.untaint
20
+ else
21
+ raise ArgumentError, "cannot check untagged input: #{path}"
22
+ end
23
+ end
24
+
25
+ def report(stats, time)
26
+ if stats[:token][:errors] == 0
27
+ puts ' ✓ %2ds' % time
28
+ else
29
+ puts '%4d seq %6.2f%% %6d tok %5.2f%% %2ds' % [
30
+ stats[:sequence][:errors],
31
+ stats[:sequence][:rate],
32
+ stats[:token][:errors],
33
+ stats[:token][:rate],
34
+ time
35
+ ]
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
@@ -6,8 +6,8 @@ module AnyStyle
6
6
  set_output_folder args[1]
7
7
  walk args[0] do |path, base_path|
8
8
  say "Analyzing #{path.relative_path_from(base_path)} ..."
9
- doc = find(path.to_s.untaint, layout: params[:layout])
10
- ref = doc[0].references
9
+ doc = find(path.to_s.untaint, params)
10
+ ref = doc[0].references(normalize_blocks: !params[:solo])
11
11
 
12
12
  if ref.length == 0
13
13
  say "no references found."
@@ -17,9 +17,9 @@ module AnyStyle
17
17
  each_format do |fmt|
18
18
  case fmt
19
19
  when 'ttx'
20
- res = doc.to_txt tagged: true
20
+ res = doc.to_s tagged: true
21
21
  when 'txt'
22
- res = doc.to_txt tagged: false
22
+ res = doc.to_s tagged: false
23
23
  when 'ref'
24
24
  res = ref.join("\n")
25
25
  else
@@ -0,0 +1,41 @@
1
+ module AnyStyle
2
+ module CLI
3
+ module Commands
4
+ class Train < Base
5
+ def run(args, params)
6
+ check_no_overwrite! args[1]
7
+
8
+ Wapiti.debug!
9
+ model = train(args[0])
10
+
11
+ if args[1].nil?
12
+ model.save
13
+ else
14
+ model.save File.expand_path(args[1]).untaint
15
+ end
16
+ end
17
+
18
+ def train(path)
19
+ case
20
+ when File.extname(path) == '.xml'
21
+ AnyStyle.parser.train path.to_s.untaint
22
+ AnyStyle.parser.model
23
+ when File.directory?(path)
24
+ AnyStyle.finder.train Dir[File.join(path, '*.ttx')].map(&:untaint)
25
+ AnyStyle.finder.model
26
+ else
27
+ raise ArgumentError, "cannot train input: #{path}"
28
+ end
29
+ end
30
+
31
+ def check_no_overwrite!(path)
32
+ if !overwrite? && (path.nil? || File.exist?(path))
33
+ raise RuntimeError,
34
+ "file exists, use --overwrite to force saving: #{path}"
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+
@@ -1,5 +1,5 @@
1
1
  module AnyStyle
2
2
  module CLI
3
- VERSION = '1.0.1'.freeze
3
+ VERSION = '1.3.1'.freeze
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvester Keil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-06-06 00:00:00.000000000 Z
11
+ date: 2020-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: anystyle
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bibtex-ruby
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '4.4'
33
+ version: '5.1'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '4.4'
40
+ version: '5.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: gli
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -65,8 +65,10 @@ files:
65
65
  - bin/anystyle
66
66
  - lib/anystyle/cli.rb
67
67
  - lib/anystyle/cli/commands/base.rb
68
+ - lib/anystyle/cli/commands/check.rb
68
69
  - lib/anystyle/cli/commands/find.rb
69
70
  - lib/anystyle/cli/commands/parse.rb
71
+ - lib/anystyle/cli/commands/train.rb
70
72
  - lib/anystyle/cli/version.rb
71
73
  homepage: http://anystyle.io
72
74
  licenses:
@@ -87,8 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
89
  - !ruby/object:Gem::Version
88
90
  version: '0'
89
91
  requirements: []
90
- rubyforge_project:
91
- rubygems_version: 2.7.4
92
+ rubygems_version: 3.1.2
92
93
  signing_key:
93
94
  specification_version: 4
94
95
  summary: AnyStyle CLI