anystyle-cli 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +79 -9
- data/bin/anystyle +91 -0
- data/lib/anystyle/cli.rb +2 -0
- data/lib/anystyle/cli/commands/base.rb +11 -3
- data/lib/anystyle/cli/commands/check.rb +41 -0
- data/lib/anystyle/cli/commands/find.rb +2 -2
- data/lib/anystyle/cli/commands/train.rb +41 -0
- data/lib/anystyle/cli/version.rb +1 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c6c4e6580d9495afd965e3401f030cee94925747
|
4
|
+
data.tar.gz: 56e3842f3b3db05ea1a7a5b17ab4f72b5ff07773
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01b90daac809b52ac69e70550952596512ef8df0bdc5503faabf4a173e83d4bf045dd2b0fcf4ae0bb28b165b182885e6f997f5f6f764638ed7b19dda67a1a87d
|
7
|
+
data.tar.gz: be29161a8a6ba925f2f21a4d50ad1cb41c93d945977cc3cdf026b58383e3c1d0f979c44966cb32934c1ce2d24d0bbba7751e8b9f771927c9c43d2eb6b8385546
|
data/README.md
CHANGED
@@ -10,22 +10,26 @@ anystyle --help
|
|
10
10
|
anystyle [global options] command [command options] [arguments...]
|
11
11
|
|
12
12
|
VERSION
|
13
|
-
1.
|
13
|
+
1.1.0 (cli 1.0.2, data 1.2.0)
|
14
14
|
|
15
15
|
GLOBAL OPTIONS
|
16
|
-
--
|
17
|
-
-
|
18
|
-
--
|
19
|
-
--
|
20
|
-
--
|
21
|
-
--
|
22
|
-
|
16
|
+
-F, --finder-model=file - Set the finder model file (default: none)
|
17
|
+
-P, --parser-model=file - Set the parser model file (default: none)
|
18
|
+
--adapter=name - Set the dictionary adapter (default: ruby)
|
19
|
+
-f, --format=name - Set the output format (default: ["json"])
|
20
|
+
--help - Show this message
|
21
|
+
--[no-]stdout - Print results directly to stdout
|
22
|
+
--[no-]verbose - Print status messages to stderr
|
23
|
+
--version - Display the program version
|
24
|
+
-w, --[no-]overwrite - Allow overwriting existing files
|
23
25
|
|
24
26
|
COMMANDS
|
27
|
+
check - Check tagged documents or references
|
25
28
|
find - Find and extract references from text documents
|
26
29
|
help - Shows a list of commands or help for one command
|
27
30
|
license - Print license information
|
28
31
|
parse - Parse and convert references
|
32
|
+
train - Create a new finder or parser model
|
29
33
|
|
30
34
|
anystyle help find
|
31
35
|
------------------
|
@@ -57,7 +61,6 @@ anystyle help find
|
|
57
61
|
|
58
62
|
Analyzing PDF documents currently depends on `pdftotext' which must be
|
59
63
|
installed separately.
|
60
|
-
|
61
64
|
EXAMPLES
|
62
65
|
anystyle -f csl,xml find thesis.pdf
|
63
66
|
|
@@ -70,6 +73,24 @@ anystyle help find
|
|
70
73
|
if your document uses a multi-column layout) and save them in BibTeX in
|
71
74
|
`./bib/thesis.bib'.
|
72
75
|
|
76
|
+
anystyle find --crop 72 thesis.pdf -
|
77
|
+
|
78
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
79
|
+
each page border and print the results to STDOUT.
|
80
|
+
|
81
|
+
anystyle find --crop 72,28 thesis.pdf -
|
82
|
+
|
83
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
84
|
+
each page's left and right border, approx. 1cm (28pt) from the top
|
85
|
+
and bottom.
|
86
|
+
|
87
|
+
|
88
|
+
COMMAND OPTIONS
|
89
|
+
-C, --crop=pt - Set cropping boundary for text extraction (default: none)
|
90
|
+
--[no-]layout - Use layout mode for PDF text extraction (default: enabled)
|
91
|
+
--[no-]solo - Include references outside of reference sections
|
92
|
+
|
93
|
+
|
73
94
|
anystyle help parse
|
74
95
|
-------------------
|
75
96
|
COMMAND OPTIONS
|
@@ -111,6 +132,55 @@ anystyle help parse
|
|
111
132
|
|
112
133
|
Extract references from `input.txt' and print them to STDOUT in CSL/JSON.
|
113
134
|
|
135
|
+
|
136
|
+
anystyle help check
|
137
|
+
-------------------
|
138
|
+
NAME
|
139
|
+
check - Check tagged documents or references
|
140
|
+
|
141
|
+
SYNOPSIS
|
142
|
+
anystyle [global options] check input
|
143
|
+
|
144
|
+
DESCRIPTION
|
145
|
+
This manual page documents the AnyStyle `check' command. AnyStyle `check'
|
146
|
+
analyzes tagged text documents or references.
|
147
|
+
|
148
|
+
The input argument can be a single TTX or XML document, or a folder
|
149
|
+
containing multiple documents.
|
150
|
+
|
151
|
+
AnyStyle `check' supports the following input formats:
|
152
|
+
ttx Tagged document format, used for training the finder model;
|
153
|
+
xml References only, XML, suitable for training the parser model.
|
154
|
+
|
155
|
+
EXAMPLES
|
156
|
+
anystyle check training-data.xml
|
157
|
+
|
158
|
+
Checks all references in the XML file and prints a report to STDOUT.
|
159
|
+
|
160
|
+
|
161
|
+
anystyle help train
|
162
|
+
-------------------
|
163
|
+
NAME
|
164
|
+
train - Create a new finder or parser model
|
165
|
+
|
166
|
+
SYNOPSIS
|
167
|
+
anystyle [global options] train input [output]
|
168
|
+
|
169
|
+
DESCRIPTION
|
170
|
+
This manual page documents the AnyStyle `train' command. AnyStyle `train'
|
171
|
+
creates a new finder or parser model based on the supplied training sets.
|
172
|
+
|
173
|
+
The input argument can be an XML document, or a folder containing multiple
|
174
|
+
TTX documents.
|
175
|
+
|
176
|
+
EXAMPLES
|
177
|
+
anystyle train data.xml my-model.mod
|
178
|
+
|
179
|
+
Creates a new parser model based on the XML training set and saves it
|
180
|
+
as `my-model.mod'. To use your model use the global `--finder-model'
|
181
|
+
or `--parser-model' flags.
|
182
|
+
|
183
|
+
|
114
184
|
License
|
115
185
|
-------
|
116
186
|
Copyright 2011-2018 Sylvester Keil. All rights reserved.
|
data/bin/anystyle
CHANGED
@@ -30,6 +30,14 @@ switch ['w', 'overwrite'],
|
|
30
30
|
switch 'stdout',
|
31
31
|
desc: 'Print results directly to stdout'
|
32
32
|
|
33
|
+
flag ['F', 'finder-model'],
|
34
|
+
arg_name: 'file',
|
35
|
+
desc: 'Set the finder model file'
|
36
|
+
|
37
|
+
flag ['P', 'parser-model'],
|
38
|
+
arg_name: 'file',
|
39
|
+
desc: 'Set the parser model file'
|
40
|
+
|
33
41
|
flag 'adapter',
|
34
42
|
default_value: 'ruby',
|
35
43
|
arg_name: 'name',
|
@@ -46,6 +54,18 @@ flag ['f', 'format'],
|
|
46
54
|
|
47
55
|
pre do |opts|
|
48
56
|
AnyStyle::Dictionary.defaults[:adapter] = opts[:adapter]
|
57
|
+
|
58
|
+
unless opts[:'finder-model'].nil?
|
59
|
+
AnyStyle::Finder.defaults[:model] =
|
60
|
+
File.expand_path(opts[:'finder-model']).untaint
|
61
|
+
end
|
62
|
+
|
63
|
+
unless opts[:'parser-model'].nil?
|
64
|
+
AnyStyle::Parser.defaults[:model] =
|
65
|
+
File.expand_path(opts[:'parser-model']).untaint
|
66
|
+
end
|
67
|
+
|
68
|
+
AnyStyle
|
49
69
|
end
|
50
70
|
|
51
71
|
|
@@ -84,6 +104,17 @@ EXAMPLES
|
|
84
104
|
Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this
|
85
105
|
if your document uses a multi-column layout) and save them in BibTeX in
|
86
106
|
`./bib/thesis.bib'.
|
107
|
+
|
108
|
+
anystyle find --crop 72 thesis.pdf -
|
109
|
+
|
110
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
111
|
+
each page border and print the results to STDOUT.
|
112
|
+
|
113
|
+
anystyle find --crop 72,28 thesis.pdf -
|
114
|
+
|
115
|
+
Extract references from `thesis.pdf' cropping away one inch (72pt) from
|
116
|
+
each page's left and right border, approx. 1cm (28pt) from the top
|
117
|
+
and bottom.
|
87
118
|
}.lstrip
|
88
119
|
|
89
120
|
arg :input
|
@@ -93,6 +124,16 @@ command :find do |cmd|
|
|
93
124
|
default_value: true,
|
94
125
|
desc: 'Use layout mode for PDF text extraction'
|
95
126
|
|
127
|
+
cmd.switch 'solo',
|
128
|
+
default_value: false,
|
129
|
+
desc: 'Include references outside of reference sections'
|
130
|
+
|
131
|
+
cmd.flag ['C', 'crop'],
|
132
|
+
arg_name: 'pt',
|
133
|
+
type: Array,
|
134
|
+
must_match: /\d+(,\d+)?|\d+,\d+(,-?\d+){2}/,
|
135
|
+
desc: 'Set cropping boundary for text extraction'
|
136
|
+
|
96
137
|
cmd.action do |opts, params, args|
|
97
138
|
Commands::Find.new(opts).run(args, params)
|
98
139
|
end
|
@@ -140,6 +181,56 @@ command :parse do |cmd|
|
|
140
181
|
end
|
141
182
|
end
|
142
183
|
|
184
|
+
desc 'Check tagged documents or references'
|
185
|
+
long_desc %{
|
186
|
+
This manual page documents the AnyStyle `check' command. AnyStyle `check'
|
187
|
+
analyzes tagged text documents or references.
|
188
|
+
|
189
|
+
The input argument can be a single TTX or XML document, or a folder
|
190
|
+
containing multiple documents.
|
191
|
+
|
192
|
+
AnyStyle `check' supports the following input formats:
|
193
|
+
ttx Tagged document format, used for training the finder model;
|
194
|
+
xml References only, XML, suitable for training the parser model.
|
195
|
+
|
196
|
+
EXAMPLES
|
197
|
+
anystyle check training-data.xml
|
198
|
+
|
199
|
+
Checks all references in the XML file and prints a report to STDOUT.
|
200
|
+
}.lstrip
|
201
|
+
|
202
|
+
arg :input
|
203
|
+
command :check do |cmd|
|
204
|
+
cmd.action do |opts, params, args|
|
205
|
+
Commands::Check.new(opts).run(args, params)
|
206
|
+
end
|
207
|
+
end
|
208
|
+
|
209
|
+
|
210
|
+
desc 'Create a new finder or parser model'
|
211
|
+
long_desc %{
|
212
|
+
This manual page documents the AnyStyle `train' command. AnyStyle `train'
|
213
|
+
creates a new finder or parser model based on the supplied training sets.
|
214
|
+
|
215
|
+
The input argument can be a XML document, or a folder containing multiple
|
216
|
+
TTX documents.
|
217
|
+
|
218
|
+
EXAMPLES
|
219
|
+
anystyle train data.xml my-model.mod
|
220
|
+
|
221
|
+
Creates a new parser model based on the XML training set and saves it
|
222
|
+
as `my-model.mod'. To use your model use the global `--finder-model'
|
223
|
+
or `--parser-model' flags.
|
224
|
+
}.lstrip
|
225
|
+
|
226
|
+
arg :input
|
227
|
+
arg :output, :optional
|
228
|
+
command :train do |cmd|
|
229
|
+
cmd.action do |opts, params, args|
|
230
|
+
Commands::Train.new(opts).run(args, params)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
143
234
|
desc 'Print license information'
|
144
235
|
command :license do |cmd|
|
145
236
|
cmd.action do
|
data/lib/anystyle/cli.rb
CHANGED
data/lib/anystyle/cli/commands/base.rb
CHANGED
@@ -28,8 +28,11 @@ module AnyStyle
|
|
28
28
|
options[:format].each(&block)
|
29
29
|
end
|
30
30
|
|
31
|
-
def find(input,
|
32
|
-
AnyStyle.find(input,
|
31
|
+
def find(input, opts = {})
|
32
|
+
AnyStyle.find(input,
|
33
|
+
format: :wapiti,
|
34
|
+
layout: opts[:layout],
|
35
|
+
crop: opts[:crop].nil? ? nil : opts[:crop].map(&:to_i))
|
33
36
|
end
|
34
37
|
|
35
38
|
def parse(input)
|
@@ -67,7 +70,12 @@ module AnyStyle
|
|
67
70
|
end
|
68
71
|
|
69
72
|
def set_output_folder(path)
|
70
|
-
|
73
|
+
case path
|
74
|
+
when nil, '-'
|
75
|
+
options[:stdout] = true
|
76
|
+
else
|
77
|
+
@output_folder = Pathname.new(path).expand_path
|
78
|
+
end
|
71
79
|
ensure
|
72
80
|
unless @output_folder.nil?
|
73
81
|
if @output_folder.exist?
|
data/lib/anystyle/cli/commands/check.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
module CLI
|
3
|
+
module Commands
|
4
|
+
class Check < Base
|
5
|
+
def run(args, params)
|
6
|
+
walk args[0] do |path|
|
7
|
+
print 'Checking %.25s' % "#{path.basename}....................."
|
8
|
+
start = Time.now
|
9
|
+
stats = check path
|
10
|
+
report stats, Time.now - start
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def check(path)
|
15
|
+
case path.extname
|
16
|
+
when '.ttx'
|
17
|
+
AnyStyle.finder.check path.to_s.untaint
|
18
|
+
when '.xml'
|
19
|
+
AnyStyle.parser.check path.to_s.untaint
|
20
|
+
else
|
21
|
+
raise ArgumentError, "cannot check untagged input: #{path}"
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def report(stats, time)
|
26
|
+
if stats[:token][:errors] == 0
|
27
|
+
puts ' ✓ %2ds' % time
|
28
|
+
else
|
29
|
+
puts '%4d seq %6.2f%% %6d tok %5.2f%% %2ds' % [
|
30
|
+
stats[:sequence][:errors],
|
31
|
+
stats[:sequence][:rate],
|
32
|
+
stats[:token][:errors],
|
33
|
+
stats[:token][:rate],
|
34
|
+
time
|
35
|
+
]
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/anystyle/cli/commands/find.rb
CHANGED
@@ -6,8 +6,8 @@ module AnyStyle
|
|
6
6
|
set_output_folder args[1]
|
7
7
|
walk args[0] do |path, base_path|
|
8
8
|
say "Analyzing #{path.relative_path_from(base_path)} ..."
|
9
|
-
doc = find(path.to_s.untaint,
|
10
|
-
ref = doc[0].references
|
9
|
+
doc = find(path.to_s.untaint, params)
|
10
|
+
ref = doc[0].references(normalize_blocks: !params[:solo])
|
11
11
|
|
12
12
|
if ref.length == 0
|
13
13
|
say "no references found."
|
data/lib/anystyle/cli/commands/train.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
module CLI
|
3
|
+
module Commands
|
4
|
+
class Train < Base
|
5
|
+
def run(args, params)
|
6
|
+
check_no_overwrite! args[1]
|
7
|
+
|
8
|
+
Wapiti.debug!
|
9
|
+
model = train(args[0])
|
10
|
+
|
11
|
+
if args[1].nil?
|
12
|
+
model.save
|
13
|
+
else
|
14
|
+
model.save File.expand_path(args[1]).untaint
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def train(path)
|
19
|
+
case
|
20
|
+
when File.extname(path) == '.xml'
|
21
|
+
AnyStyle.parser.train path.to_s.untaint
|
22
|
+
AnyStyle.parser.model
|
23
|
+
when File.directory?(path)
|
24
|
+
AnyStyle.finder.train path.to_s.untaint
|
25
|
+
AnyStyle.finder.model
|
26
|
+
else
|
27
|
+
raise ArgumentError, "cannot train input: #{path}"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def check_no_overwrite!(path)
|
32
|
+
if !overwrite? && (path.nil? || File.exist?(path))
|
33
|
+
raise RuntimeError,
|
34
|
+
"file exists, use --overwrite to force saving: #{path}"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
data/lib/anystyle/cli/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: anystyle
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
19
|
+
version: '1.2'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1.
|
26
|
+
version: '1.2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bibtex-ruby
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -65,8 +65,10 @@ files:
|
|
65
65
|
- bin/anystyle
|
66
66
|
- lib/anystyle/cli.rb
|
67
67
|
- lib/anystyle/cli/commands/base.rb
|
68
|
+
- lib/anystyle/cli/commands/check.rb
|
68
69
|
- lib/anystyle/cli/commands/find.rb
|
69
70
|
- lib/anystyle/cli/commands/parse.rb
|
71
|
+
- lib/anystyle/cli/commands/train.rb
|
70
72
|
- lib/anystyle/cli/version.rb
|
71
73
|
homepage: http://anystyle.io
|
72
74
|
licenses:
|
@@ -88,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
90
|
version: '0'
|
89
91
|
requirements: []
|
90
92
|
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.6.13
|
92
94
|
signing_key:
|
93
95
|
specification_version: 4
|
94
96
|
summary: AnyStyle CLI
|