anystyle-cli 1.0.1 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +81 -9
- data/bin/anystyle +110 -0
- data/lib/anystyle/cli.rb +2 -0
- data/lib/anystyle/cli/commands/base.rb +29 -5
- data/lib/anystyle/cli/commands/check.rb +41 -0
- data/lib/anystyle/cli/commands/find.rb +4 -4
- data/lib/anystyle/cli/commands/train.rb +41 -0
- data/lib/anystyle/cli/version.rb +1 -1
- metadata +9 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 25fa9822dfe3ab9862a84f5c31ae70df159643a3ecc40c4958ef0ed77b17da54
|
4
|
+
data.tar.gz: cf7587eec67a7372df802f4a7ee63e4ef1bc915070137c56d424cb2fcb1e813d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1a0b7ec3479c88d429fc1b58855473fefc0cdfac9171a72c2124e3fb2e75b3503caf9963d072c74da5a55673af69ca1db9cc751dcf1ece6ae58d997a5837700
|
7
|
+
data.tar.gz: c8c410d03fe9dc43a1667d77f0931fd96d4f004ab9a1d45b21bf4eacad68054f3a301ea0241a42a42affd77dda7fbef24f042e1d3a4e4dc702da4b261a306dd5
|
data/README.md
CHANGED
@@ -10,22 +10,28 @@ anystyle --help
|
|
10
10
|
anystyle [global options] command [command options] [arguments...]
|
11
11
|
|
12
12
|
VERSION
|
13
|
-
1.
|
13
|
+
1.1.0 (cli 1.0.2, data 1.2.0)
|
14
14
|
|
15
15
|
GLOBAL OPTIONS
|
16
|
-
--
|
17
|
-
-
|
18
|
-
--
|
19
|
-
--
|
20
|
-
--
|
21
|
-
--
|
22
|
-
|
16
|
+
-F, --finder-model=file - Set the finder model file (default: none)
|
17
|
+
-P, --parser-model=file - Set the parser model file (default: none)
|
18
|
+
--adapter=name - Set the dictionary adapter (default: ruby)
|
19
|
+
-f, --format=name - Set the output format (default: ["json"])
|
20
|
+
--pdfinfo=path - Set the path for pdfinfo (default: none)
|
21
|
+
--pdftotext=path - Set the path for pdftotext (default: none)
|
22
|
+
--help - Show this message
|
23
|
+
--[no-]stdout - Print results directly to stdout
|
24
|
+
--[no-]verbose - Print status messages to stderr
|
25
|
+
--version - Display the program version
|
26
|
+
-w, --[no-]overwrite - Allow overwriting existing files
|
23
27
|
|
24
28
|
COMMANDS
|
29
|
+
check - Check tagged documents or references
|
25
30
|
find - Find and extract references from text documents
|
26
31
|
help - Shows a list of commands or help for one command
|
27
32
|
license - Print license information
|
28
33
|
parse - Parse and convert references
|
34
|
+
train - Create a new finder or parser model
|
29
35
|
|
30
36
|
anystyle help find
|
31
37
|
------------------
|
@@ -57,7 +63,6 @@ anystyle help find
|
|
57
63
|
|
58
64
|
Analyzing PDF documents currently depends on `pdftotext' which must be
|
59
65
|
installed separately.
|
60
|
-
|
61
66
|
EXAMPLES
|
62
67
|
anystyle -f csl,xml find thesis.pdf
|
63
68
|
|
@@ -70,6 +75,24 @@ anystyle help find
|
|
70
75
|
if your document uses a multi-column layout) and save them in BibTeX in
|
71
76
|
`./bib/thesis.bib'.
|
72
77
|
|
78
|
+
anystyle find --crop 72 thesis.pdf -
|
79
|
+
|
80
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
81
|
+
each page border and print the results to STDOUT.
|
82
|
+
|
83
|
+
anystyle find --crop 72,28 thesis.pdf -
|
84
|
+
|
85
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
86
|
+
each page's left and right border, approx. 1cm (28pt) from the top
|
87
|
+
and bottom.
|
88
|
+
|
89
|
+
|
90
|
+
COMMAND OPTIONS
|
91
|
+
-C, --crop=pt - Set cropping boundary for text extraction (default: none)
|
92
|
+
--[no-]layout - Use layout mode for PDF text extraction (default: enabled)
|
93
|
+
--[no-]solo - Include references outside of reference sections
|
94
|
+
|
95
|
+
|
73
96
|
anystyle help parse
|
74
97
|
-------------------
|
75
98
|
COMMAND OPTIONS
|
@@ -111,6 +134,55 @@ anystyle help parse
|
|
111
134
|
|
112
135
|
Extract references from `input.txt' and print them to STDOUT in CSL/JSON.
|
113
136
|
|
137
|
+
|
138
|
+
anystyle help check
|
139
|
+
-------------------
|
140
|
+
NAME
|
141
|
+
check - Check tagged documents or references
|
142
|
+
|
143
|
+
SYNOPSIS
|
144
|
+
anystyle [global options] check input
|
145
|
+
|
146
|
+
DESCRIPTION
|
147
|
+
This manual page documents the AnyStyle `check' command. AnyStyle `check'
|
148
|
+
analyzes tagged text documents or references.
|
149
|
+
|
150
|
+
The input argument can be a single TTX or XML document, or a folder
|
151
|
+
containing multiple documents.
|
152
|
+
|
153
|
+
AnyStyle `check' supports the following input formats:
|
154
|
+
ttx Tagged document format, used for training the finder model;
|
155
|
+
xml References only, XML, suitable for training the parser model.
|
156
|
+
|
157
|
+
EXAMPLES
|
158
|
+
anystyle check training-data.xml
|
159
|
+
|
160
|
+
Checks all references in the XML file and prints a report to STDOUT.
|
161
|
+
|
162
|
+
|
163
|
+
anystyle help train
|
164
|
+
-------------------
|
165
|
+
NAME
|
166
|
+
train - Create a new finder or parser model
|
167
|
+
|
168
|
+
SYNOPSIS
|
169
|
+
anystyle [global options] train input [output]
|
170
|
+
|
171
|
+
DESCRIPTION
|
172
|
+
This manual page documents the AnyStyle `train' command. AnyStyle `train'
|
173
|
+
creates a new finder or parser model based on the supplied training sets.
|
174
|
+
|
175
|
+
The input argument can be an XML document, or a folder containing multiple
|
176
|
+
TTX documents.
|
177
|
+
|
178
|
+
EXAMPLES
|
179
|
+
anystyle train data.xml my-model.mod
|
180
|
+
|
181
|
+
Creates a new parser model based on the XML training set and saves it
|
182
|
+
as `my-model.mod'. To use your model use the global `--finder-model'
|
183
|
+
or `--parser-model' flags.
|
184
|
+
|
185
|
+
|
114
186
|
License
|
115
187
|
-------
|
116
188
|
Copyright 2011-2018 Sylvester Keil. All rights reserved.
|
data/bin/anystyle
CHANGED
@@ -30,6 +30,23 @@ switch ['w', 'overwrite'],
|
|
30
30
|
switch 'stdout',
|
31
31
|
desc: 'Print results directly to stdout'
|
32
32
|
|
33
|
+
flag ['F', 'finder-model'],
|
34
|
+
arg_name: 'file',
|
35
|
+
desc: 'Set the finder model file'
|
36
|
+
|
37
|
+
flag ['P', 'parser-model'],
|
38
|
+
arg_name: 'file',
|
39
|
+
desc: 'Set the parser model file'
|
40
|
+
|
41
|
+
flag 'pdftotext',
|
42
|
+
arg_name: 'path',
|
43
|
+
desc: 'Set the path for pdftotext'
|
44
|
+
|
45
|
+
flag 'pdfinfo',
|
46
|
+
arg_name: 'path',
|
47
|
+
desc: 'Set the path for pdfinfo'
|
48
|
+
|
49
|
+
|
33
50
|
flag 'adapter',
|
34
51
|
default_value: 'ruby',
|
35
52
|
arg_name: 'name',
|
@@ -46,6 +63,28 @@ flag ['f', 'format'],
|
|
46
63
|
|
47
64
|
pre do |opts|
|
48
65
|
AnyStyle::Dictionary.defaults[:adapter] = opts[:adapter]
|
66
|
+
|
67
|
+
unless opts[:'finder-model'].nil?
|
68
|
+
AnyStyle::Finder.defaults[:model] =
|
69
|
+
File.expand_path(opts[:'finder-model']).untaint
|
70
|
+
end
|
71
|
+
|
72
|
+
unless opts[:'parser-model'].nil?
|
73
|
+
AnyStyle::Parser.defaults[:model] =
|
74
|
+
File.expand_path(opts[:'parser-model']).untaint
|
75
|
+
end
|
76
|
+
|
77
|
+
unless opts[:pdftotext].nil?
|
78
|
+
AnyStyle::Finder.defaults[:pdftotext] =
|
79
|
+
opts[:pdftotext].untaint
|
80
|
+
end
|
81
|
+
|
82
|
+
unless opts[:pdfinfo].nil?
|
83
|
+
AnyStyle::Finder.defaults[:pdfinfo] =
|
84
|
+
opts[:pdfinfo].untaint
|
85
|
+
end
|
86
|
+
|
87
|
+
AnyStyle
|
49
88
|
end
|
50
89
|
|
51
90
|
|
@@ -84,6 +123,17 @@ EXAMPLES
|
|
84
123
|
Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this
|
85
124
|
if your document uses a multi-column layout) and save them in BibTeX in
|
86
125
|
`./bib/thesis.bib'.
|
126
|
+
|
127
|
+
anystyle find --crop 72 thesis.pdf -
|
128
|
+
|
129
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
130
|
+
each page border and print the results to STDOUT.
|
131
|
+
|
132
|
+
anystyle find --crop 72,28 thesis.pdf -
|
133
|
+
|
134
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
135
|
+
each page's left and right border, approx. 1cm (28pt) from the top
|
136
|
+
and bottom.
|
87
137
|
}.lstrip
|
88
138
|
|
89
139
|
arg :input
|
@@ -93,6 +143,16 @@ command :find do |cmd|
|
|
93
143
|
default_value: true,
|
94
144
|
desc: 'Use layout mode for PDF text extraction'
|
95
145
|
|
146
|
+
cmd.switch 'solo',
|
147
|
+
default_value: false,
|
148
|
+
desc: 'Include references outside of reference sections'
|
149
|
+
|
150
|
+
cmd.flag ['C', 'crop'],
|
151
|
+
arg_name: 'pt',
|
152
|
+
type: Array,
|
153
|
+
must_match: /\d+(,\d+)?|\d+,\d+(,-?\d+){2}/,
|
154
|
+
desc: 'Set cropping boundary for text extraction'
|
155
|
+
|
96
156
|
cmd.action do |opts, params, args|
|
97
157
|
Commands::Find.new(opts).run(args, params)
|
98
158
|
end
|
@@ -140,6 +200,56 @@ command :parse do |cmd|
|
|
140
200
|
end
|
141
201
|
end
|
142
202
|
|
203
|
+
desc 'Check tagged documents or references'
|
204
|
+
long_desc %{
|
205
|
+
This manual page documents the AnyStyle `check' command. AnyStyle `check'
|
206
|
+
analyzes tagged text documents or references.
|
207
|
+
|
208
|
+
The input argument can be a single TTX or XML document, or a folder
|
209
|
+
containing multiple documents.
|
210
|
+
|
211
|
+
AnyStyle `check' supports the following input formats:
|
212
|
+
ttx Tagged document format, used for training the finder model;
|
213
|
+
xml References only, XML, suitable for training the parser model.
|
214
|
+
|
215
|
+
EXAMPLES
|
216
|
+
anystyle check training-data.xml
|
217
|
+
|
218
|
+
Checks all references in the XML file and prints a report to STDOUT.
|
219
|
+
}.lstrip
|
220
|
+
|
221
|
+
arg :input
|
222
|
+
command :check do |cmd|
|
223
|
+
cmd.action do |opts, params, args|
|
224
|
+
Commands::Check.new(opts).run(args, params)
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
|
229
|
+
desc 'Create a new finder or parser model'
|
230
|
+
long_desc %{
|
231
|
+
This manual page documents the AnyStyle `train' command. AnyStyle `train'
|
232
|
+
creates a new finder or parser model based on the supplied training sets.
|
233
|
+
|
234
|
+
The input argument can be a XML document, or a folder containing multiple
|
235
|
+
TTX documents.
|
236
|
+
|
237
|
+
EXAMPLES
|
238
|
+
anystyle train data.xml my-model.mod
|
239
|
+
|
240
|
+
Creates a new parser model based on the XML training set and saves it
|
241
|
+
as `my-model.mod'. To use your model use the global `--finder-model'
|
242
|
+
or `--parser-model' flags.
|
243
|
+
}.lstrip
|
244
|
+
|
245
|
+
arg :input
|
246
|
+
arg :output, :optional
|
247
|
+
command :train do |cmd|
|
248
|
+
cmd.action do |opts, params, args|
|
249
|
+
Commands::Train.new(opts).run(args, params)
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
143
253
|
desc 'Print license information'
|
144
254
|
command :license do |cmd|
|
145
255
|
cmd.action do
|
data/lib/anystyle/cli.rb
CHANGED
@@ -28,8 +28,11 @@ module AnyStyle
|
|
28
28
|
options[:format].each(&block)
|
29
29
|
end
|
30
30
|
|
31
|
-
def find(input,
|
32
|
-
AnyStyle.find(input,
|
31
|
+
def find(input, opts = {})
|
32
|
+
AnyStyle.find(input,
|
33
|
+
format: :wapiti,
|
34
|
+
layout: opts[:layout],
|
35
|
+
crop: opts[:crop].nil? ? nil : opts[:crop].map(&:to_i))
|
33
36
|
end
|
34
37
|
|
35
38
|
def parse(input)
|
@@ -67,7 +70,12 @@ module AnyStyle
|
|
67
70
|
end
|
68
71
|
|
69
72
|
def set_output_folder(path)
|
70
|
-
|
73
|
+
case path
|
74
|
+
when nil, '-'
|
75
|
+
options[:stdout] = true
|
76
|
+
else
|
77
|
+
@output_folder = Pathname.new(path).expand_path
|
78
|
+
end
|
71
79
|
ensure
|
72
80
|
unless @output_folder.nil?
|
73
81
|
if @output_folder.exist?
|
@@ -83,16 +91,32 @@ module AnyStyle
|
|
83
91
|
STDERR.puts(*args) if verbose?
|
84
92
|
end
|
85
93
|
|
94
|
+
def report(error, file)
|
95
|
+
STDERR.puts "Error processing `#{file}'"
|
96
|
+
STDERR.puts " #{error.message}"
|
97
|
+
STDERR.puts " #{error.backtrace[0]}"
|
98
|
+
STDERR.puts " #{error.backtrace[1]}"
|
99
|
+
STDERR.puts " ..."
|
100
|
+
end
|
101
|
+
|
86
102
|
def walk(input)
|
87
103
|
path = Pathname(input).expand_path
|
88
104
|
raise ArgumentError, "path does not exist: #{input}" unless path.exist?
|
89
105
|
|
90
106
|
if path.directory?
|
91
107
|
path.each_child do |file|
|
92
|
-
|
108
|
+
begin
|
109
|
+
yield file, path unless file.directory?
|
110
|
+
rescue => e
|
111
|
+
report e, file.relative_path_from(path)
|
112
|
+
end
|
93
113
|
end
|
94
114
|
else
|
95
|
-
|
115
|
+
begin
|
116
|
+
yield path, path.dirname
|
117
|
+
rescue => e
|
118
|
+
report e, path.basename
|
119
|
+
end
|
96
120
|
end
|
97
121
|
end
|
98
122
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
module CLI
|
3
|
+
module Commands
|
4
|
+
class Check < Base
|
5
|
+
def run(args, params)
|
6
|
+
walk args[0] do |path|
|
7
|
+
print 'Checking %.25s' % "#{path.basename}....................."
|
8
|
+
start = Time.now
|
9
|
+
stats = check path
|
10
|
+
report stats, Time.now - start
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def check(path)
|
15
|
+
case path.extname
|
16
|
+
when '.ttx'
|
17
|
+
AnyStyle.finder.check path.to_s.untaint
|
18
|
+
when '.xml'
|
19
|
+
AnyStyle.parser.check path.to_s.untaint
|
20
|
+
else
|
21
|
+
raise ArgumentError, "cannot check untagged input: #{path}"
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def report(stats, time)
|
26
|
+
if stats[:token][:errors] == 0
|
27
|
+
puts ' ✓ %2ds' % time
|
28
|
+
else
|
29
|
+
puts '%4d seq %6.2f%% %6d tok %5.2f%% %2ds' % [
|
30
|
+
stats[:sequence][:errors],
|
31
|
+
stats[:sequence][:rate],
|
32
|
+
stats[:token][:errors],
|
33
|
+
stats[:token][:rate],
|
34
|
+
time
|
35
|
+
]
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -6,8 +6,8 @@ module AnyStyle
|
|
6
6
|
set_output_folder args[1]
|
7
7
|
walk args[0] do |path, base_path|
|
8
8
|
say "Analyzing #{path.relative_path_from(base_path)} ..."
|
9
|
-
doc = find(path.to_s.untaint,
|
10
|
-
ref = doc[0].references
|
9
|
+
doc = find(path.to_s.untaint, params)
|
10
|
+
ref = doc[0].references(normalize_blocks: !params[:solo])
|
11
11
|
|
12
12
|
if ref.length == 0
|
13
13
|
say "no references found."
|
@@ -17,9 +17,9 @@ module AnyStyle
|
|
17
17
|
each_format do |fmt|
|
18
18
|
case fmt
|
19
19
|
when 'ttx'
|
20
|
-
res = doc.
|
20
|
+
res = doc.to_s tagged: true
|
21
21
|
when 'txt'
|
22
|
-
res = doc.
|
22
|
+
res = doc.to_s tagged: false
|
23
23
|
when 'ref'
|
24
24
|
res = ref.join("\n")
|
25
25
|
else
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
module CLI
|
3
|
+
module Commands
|
4
|
+
class Train < Base
|
5
|
+
def run(args, params)
|
6
|
+
check_no_overwrite! args[1]
|
7
|
+
|
8
|
+
Wapiti.debug!
|
9
|
+
model = train(args[0])
|
10
|
+
|
11
|
+
if args[1].nil?
|
12
|
+
model.save
|
13
|
+
else
|
14
|
+
model.save File.expand_path(args[1]).untaint
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def train(path)
|
19
|
+
case
|
20
|
+
when File.extname(path) == '.xml'
|
21
|
+
AnyStyle.parser.train path.to_s.untaint
|
22
|
+
AnyStyle.parser.model
|
23
|
+
when File.directory?(path)
|
24
|
+
AnyStyle.finder.train Dir[File.join(path, '*.ttx')].map(&:untaint)
|
25
|
+
AnyStyle.finder.model
|
26
|
+
else
|
27
|
+
raise ArgumentError, "cannot train input: #{path}"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def check_no_overwrite!(path)
|
32
|
+
if !overwrite? && (path.nil? || File.exist?(path))
|
33
|
+
raise RuntimeError,
|
34
|
+
"file exists, use --overwrite to force saving: #{path}"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
data/lib/anystyle/cli/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: anystyle
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
19
|
+
version: '1.3'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1.
|
26
|
+
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bibtex-ruby
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '5.1'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '5.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: gli
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -65,8 +65,10 @@ files:
|
|
65
65
|
- bin/anystyle
|
66
66
|
- lib/anystyle/cli.rb
|
67
67
|
- lib/anystyle/cli/commands/base.rb
|
68
|
+
- lib/anystyle/cli/commands/check.rb
|
68
69
|
- lib/anystyle/cli/commands/find.rb
|
69
70
|
- lib/anystyle/cli/commands/parse.rb
|
71
|
+
- lib/anystyle/cli/commands/train.rb
|
70
72
|
- lib/anystyle/cli/version.rb
|
71
73
|
homepage: http://anystyle.io
|
72
74
|
licenses:
|
@@ -87,8 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
89
|
- !ruby/object:Gem::Version
|
88
90
|
version: '0'
|
89
91
|
requirements: []
|
90
|
-
|
91
|
-
rubygems_version: 2.7.4
|
92
|
+
rubygems_version: 3.1.2
|
92
93
|
signing_key:
|
93
94
|
specification_version: 4
|
94
95
|
summary: AnyStyle CLI
|