anystyle-cli 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: '082b162dc4771fef9ba5ec4ee4fa623931ba0e34c8617a5b47ca7545bb14ce58'
4
- data.tar.gz: d035793b06dc909b570defd917995a782fe5cee3eafc9674aa3e696b64e73e98
2
+ SHA1:
3
+ metadata.gz: c6c4e6580d9495afd965e3401f030cee94925747
4
+ data.tar.gz: 56e3842f3b3db05ea1a7a5b17ab4f72b5ff07773
5
5
  SHA512:
6
- metadata.gz: a2836a714f4071917502ee641733d491c33710fa9b0e07f113ebf1c07c3bf2f2a97f9d5621312ca2eb942768a5b481597e03573ffa27cdc31ecba5d02351a7eb
7
- data.tar.gz: d51da26bceb9b7f2dd56ae1322145035c8669ad52ab5e0ac7bb11a6f9972ad3f2a757cfbb3e61272855d5605be4b3de40a44c1c2a644e1bea744e9bcd0fb319f
6
+ metadata.gz: 01b90daac809b52ac69e70550952596512ef8df0bdc5503faabf4a173e83d4bf045dd2b0fcf4ae0bb28b165b182885e6f997f5f6f764638ed7b19dda67a1a87d
7
+ data.tar.gz: be29161a8a6ba925f2f21a4d50ad1cb41c93d945977cc3cdf026b58383e3c1d0f979c44966cb32934c1ce2d24d0bbba7751e8b9f771927c9c43d2eb6b8385546
data/README.md CHANGED
@@ -10,22 +10,26 @@ anystyle --help
10
10
  anystyle [global options] command [command options] [arguments...]
11
11
 
12
12
  VERSION
13
- 1.0.0 (cli 1.0.0, data 1.2.0)
13
+ 1.1.0 (cli 1.0.2, data 1.2.0)
14
14
 
15
15
  GLOBAL OPTIONS
16
- --adapter=name - Set the dictionary adapter (default: ruby)
17
- -f, --format=name - Set the output format (default: ["json"])
18
- --help - Show this message
19
- --[no-]stdout - Print results directly to stdout
20
- --[no-]verbose - Print status messages to stderr
21
- --version - Display the program version
22
- -w, --[no-]overwrite - Allow overwriting existing files
16
+ -F, --finder-model=file - Set the finder model file (default: none)
17
+ -P, --parser-model=file - Set the parser model file (default: none)
18
+ --adapter=name - Set the dictionary adapter (default: ruby)
19
+ -f, --format=name - Set the output format (default: ["json"])
20
+ --help - Show this message
21
+ --[no-]stdout - Print results directly to stdout
22
+ --[no-]verbose - Print status messages to stderr
23
+ --version - Display the program version
24
+ -w, --[no-]overwrite - Allow overwriting existing files
23
25
 
24
26
  COMMANDS
27
+ check - Check tagged documents or references
25
28
  find - Find and extract references from text documents
26
29
  help - Shows a list of commands or help for one command
27
30
  license - Print license information
28
31
  parse - Parse and convert references
32
+ train - Create a new finder or parser model
29
33
 
30
34
  anystyle help find
31
35
  ------------------
@@ -57,7 +61,6 @@ anystyle help find
57
61
 
58
62
  Analyzing PDF documents currently depends on `pdftotext' which must be
59
63
  installed separately.
60
-
61
64
  EXAMPLES
62
65
  anystyle -f csl,xml find thesis.pdf
63
66
 
@@ -70,6 +73,24 @@ anystyle help find
70
73
  if your document uses a multi-column layout) and save them in BibTeX in
71
74
  `./bib/thesis.bib'.
72
75
 
76
+ anystyle find --crop 72 thesis.pdf -
77
+
78
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
79
+ each page border and print the results to STDOUT.
80
+
81
+ anystyle find --crop 72,28 thesis.pdf -
82
+
83
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
84
+ each page's left and right border, approx. 1cm (28pt) from the top
85
+ and bottom.
86
+
87
+
88
+ COMMAND OPTIONS
89
+ -C, --crop=pt - Set cropping boundary for text extraction (default: none)
90
+ --[no-]layout - Use layout mode for PDF text extraction (default: enabled)
91
+ --[no-]solo - Include references outside of reference sections
92
+
93
+
73
94
  anystyle help parse
74
95
  -------------------
75
96
  COMMAND OPTIONS
@@ -111,6 +132,55 @@ anystyle help parse
111
132
 
112
133
  Extract references from `input.txt' and print them to STDOUT in CSL/JSON.
113
134
 
135
+
136
+ anystyle help check
137
+ -------------------
138
+ NAME
139
+ check - Check tagged documents or references
140
+
141
+ SYNOPSIS
142
+ anystyle [global options] check input
143
+
144
+ DESCRIPTION
145
+ This manual page documents the AnyStyle `check' command. AnyStyle `check'
146
+ analyzes tagged text documents or references.
147
+
148
+ The input argument can be a single TTX or XML document, or a folder
149
+ containing multiple documents.
150
+
151
+ AnyStyle `check' supports the following input formats:
152
+ ttx Tagged document format, used for training the finder model;
153
+ xml References only, XML, suitable for training the parser model.
154
+
155
+ EXAMPLES
156
+ anystyle check training-data.xml
157
+
158
+ Checks all references in the XML file and prints a report to STDOUT.
159
+
160
+
161
+ anystyle help train
162
+ -------------------
163
+ NAME
164
+ train - Create a new finder or parser model
165
+
166
+ SYNOPSIS
167
+ anystyle [global options] train input [output]
168
+
169
+ DESCRIPTION
170
+ This manual page documents the AnyStyle `train' command. AnyStyle `train'
171
+ creates a new finder or parser model based on the supplied training sets.
172
+
173
+ The input argument can be an XML document, or a folder containing multiple
174
+ TTX documents.
175
+
176
+ EXAMPLES
177
+ anystyle train data.xml my-model.mod
178
+
179
+ Creates a new parser model based on the XML training set and saves it
180
+ as `my-model.mod'. To use your model use the global `--finder-model'
181
+ or `--parser-model' flags.
182
+
183
+
114
184
  License
115
185
  -------
116
186
  Copyright 2011-2018 Sylvester Keil. All rights reserved.
data/bin/anystyle CHANGED
@@ -30,6 +30,14 @@ switch ['w', 'overwrite'],
30
30
  switch 'stdout',
31
31
  desc: 'Print results directly to stdout'
32
32
 
33
+ flag ['F', 'finder-model'],
34
+ arg_name: 'file',
35
+ desc: 'Set the finder model file'
36
+
37
+ flag ['P', 'parser-model'],
38
+ arg_name: 'file',
39
+ desc: 'Set the parser model file'
40
+
33
41
  flag 'adapter',
34
42
  default_value: 'ruby',
35
43
  arg_name: 'name',
@@ -46,6 +54,18 @@ flag ['f', 'format'],
46
54
 
47
55
  pre do |opts|
48
56
  AnyStyle::Dictionary.defaults[:adapter] = opts[:adapter]
57
+
58
+ unless opts[:'finder-model'].nil?
59
+ AnyStyle::Finder.defaults[:model] =
60
+ File.expand_path(opts[:'finder-model']).untaint
61
+ end
62
+
63
+ unless opts[:'parser-model'].nil?
64
+ AnyStyle::Parser.defaults[:model] =
65
+ File.expand_path(opts[:'parser-model']).untaint
66
+ end
67
+
68
+ AnyStyle
49
69
  end
50
70
 
51
71
 
@@ -84,6 +104,17 @@ EXAMPLES
84
104
  Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this
85
105
  if your document uses a multi-column layout) and save them in BibTeX in
86
106
  `./bib/thesis.bib'.
107
+
108
+ anystyle find --crop 72 thesis.pdf -
109
+
110
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
111
+ each page border and print the results to STDOUT.
112
+
113
+ anystyle find --crop 72,28 thesis.pdf -
114
+
115
+ Extract references from `thesis.pdf' cropping away one inch (72pt) from
116
+ each page's left and right border, approx. 1cm (28pt) from the top
117
+ and bottom.
87
118
  }.lstrip
88
119
 
89
120
  arg :input
@@ -93,6 +124,16 @@ command :find do |cmd|
93
124
  default_value: true,
94
125
  desc: 'Use layout mode for PDF text extraction'
95
126
 
127
+ cmd.switch 'solo',
128
+ default_value: false,
129
+ desc: 'Include references outside of reference sections'
130
+
131
+ cmd.flag ['C', 'crop'],
132
+ arg_name: 'pt',
133
+ type: Array,
134
+ must_match: /\d+(,\d+)?|\d+,\d+(,-?\d+){2}/,
135
+ desc: 'Set cropping boundary for text extraction'
136
+
96
137
  cmd.action do |opts, params, args|
97
138
  Commands::Find.new(opts).run(args, params)
98
139
  end
@@ -140,6 +181,56 @@ command :parse do |cmd|
140
181
  end
141
182
  end
142
183
 
184
+ desc 'Check tagged documents or references'
185
+ long_desc %{
186
+ This manual page documents the AnyStyle `check' command. AnyStyle `check'
187
+ analyzes tagged text documents or references.
188
+
189
+ The input argument can be a single TTX or XML document, or a folder
190
+ containing multiple documents.
191
+
192
+ AnyStyle `check' supports the following input formats:
193
+ ttx Tagged document format, used for training the finder model;
194
+ xml References only, XML, suitable for training the parser model.
195
+
196
+ EXAMPLES
197
+ anystyle check training-data.xml
198
+
199
+ Checks all references in the XML file and prints a report to STDOUT.
200
+ }.lstrip
201
+
202
+ arg :input
203
+ command :check do |cmd|
204
+ cmd.action do |opts, params, args|
205
+ Commands::Check.new(opts).run(args, params)
206
+ end
207
+ end
208
+
209
+
210
+ desc 'Create a new finder or parser model'
211
+ long_desc %{
212
+ This manual page documents the AnyStyle `train' command. AnyStyle `train'
213
+ creates a new finder or parser model based on the supplied training sets.
214
+
215
+ The input argument can be a XML document, or a folder containing multiple
216
+ TTX documents.
217
+
218
+ EXAMPLES
219
+ anystyle train data.xml my-model.mod
220
+
221
+ Creates a new parser model based on the XML training set and saves it
222
+ as `my-model.mod'. To use your model use the global `--finder-model'
223
+ or `--parser-model' flags.
224
+ }.lstrip
225
+
226
+ arg :input
227
+ arg :output, :optional
228
+ command :train do |cmd|
229
+ cmd.action do |opts, params, args|
230
+ Commands::Train.new(opts).run(args, params)
231
+ end
232
+ end
233
+
143
234
  desc 'Print license information'
144
235
  command :license do |cmd|
145
236
  cmd.action do
data/lib/anystyle/cli.rb CHANGED
@@ -4,5 +4,7 @@ require 'anystyle'
4
4
 
5
5
  require 'anystyle/cli/version'
6
6
  require 'anystyle/cli/commands/base'
7
+ require 'anystyle/cli/commands/check'
7
8
  require 'anystyle/cli/commands/find'
8
9
  require 'anystyle/cli/commands/parse'
10
+ require 'anystyle/cli/commands/train'
@@ -28,8 +28,11 @@ module AnyStyle
28
28
  options[:format].each(&block)
29
29
  end
30
30
 
31
- def find(input, **opts)
32
- AnyStyle.find(input, format: :wapiti, **opts)
31
+ def find(input, opts = {})
32
+ AnyStyle.find(input,
33
+ format: :wapiti,
34
+ layout: opts[:layout],
35
+ crop: opts[:crop].nil? ? nil : opts[:crop].map(&:to_i))
33
36
  end
34
37
 
35
38
  def parse(input)
@@ -67,7 +70,12 @@ module AnyStyle
67
70
  end
68
71
 
69
72
  def set_output_folder(path)
70
- @output_folder = Pathname.new(path).expand_path unless path.nil?
73
+ case path
74
+ when nil, '-'
75
+ options[:stdout] = true
76
+ else
77
+ @output_folder = Pathname.new(path).expand_path
78
+ end
71
79
  ensure
72
80
  unless @output_folder.nil?
73
81
  if @output_folder.exist?
@@ -0,0 +1,41 @@
1
+ module AnyStyle
2
+ module CLI
3
+ module Commands
4
+ class Check < Base
5
+ def run(args, params)
6
+ walk args[0] do |path|
7
+ print 'Checking %.25s' % "#{path.basename}....................."
8
+ start = Time.now
9
+ stats = check path
10
+ report stats, Time.now - start
11
+ end
12
+ end
13
+
14
+ def check(path)
15
+ case path.extname
16
+ when '.ttx'
17
+ AnyStyle.finder.check path.to_s.untaint
18
+ when '.xml'
19
+ AnyStyle.parser.check path.to_s.untaint
20
+ else
21
+ raise ArgumentError, "cannot check untagged input: #{path}"
22
+ end
23
+ end
24
+
25
+ def report(stats, time)
26
+ if stats[:token][:errors] == 0
27
+ puts ' ✓ %2ds' % time
28
+ else
29
+ puts '%4d seq %6.2f%% %6d tok %5.2f%% %2ds' % [
30
+ stats[:sequence][:errors],
31
+ stats[:sequence][:rate],
32
+ stats[:token][:errors],
33
+ stats[:token][:rate],
34
+ time
35
+ ]
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
@@ -6,8 +6,8 @@ module AnyStyle
6
6
  set_output_folder args[1]
7
7
  walk args[0] do |path, base_path|
8
8
  say "Analyzing #{path.relative_path_from(base_path)} ..."
9
- doc = find(path.to_s.untaint, layout: params[:layout])
10
- ref = doc[0].references
9
+ doc = find(path.to_s.untaint, params)
10
+ ref = doc[0].references(normalize_blocks: !params[:solo])
11
11
 
12
12
  if ref.length == 0
13
13
  say "no references found."
@@ -0,0 +1,41 @@
1
+ module AnyStyle
2
+ module CLI
3
+ module Commands
4
+ class Train < Base
5
+ def run(args, params)
6
+ check_no_overwrite! args[1]
7
+
8
+ Wapiti.debug!
9
+ model = train(args[0])
10
+
11
+ if args[1].nil?
12
+ model.save
13
+ else
14
+ model.save File.expand_path(args[1]).untaint
15
+ end
16
+ end
17
+
18
+ def train(path)
19
+ case
20
+ when File.extname(path) == '.xml'
21
+ AnyStyle.parser.train path.to_s.untaint
22
+ AnyStyle.parser.model
23
+ when File.directory?(path)
24
+ AnyStyle.finder.train path.to_s.untaint
25
+ AnyStyle.finder.model
26
+ else
27
+ raise ArgumentError, "cannot train input: #{path}"
28
+ end
29
+ end
30
+
31
+ def check_no_overwrite!(path)
32
+ if !overwrite? && (path.nil? || File.exist?(path))
33
+ raise RuntimeError,
34
+ "file exists, use --overwrite to force saving: #{path}"
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+
@@ -1,5 +1,5 @@
1
1
  module AnyStyle
2
2
  module CLI
3
- VERSION = '1.0.2'.freeze
3
+ VERSION = '1.1.0'.freeze
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvester Keil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-07-11 00:00:00.000000000 Z
11
+ date: 2018-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: anystyle
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '1.2'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '1.2'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bibtex-ruby
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -65,8 +65,10 @@ files:
65
65
  - bin/anystyle
66
66
  - lib/anystyle/cli.rb
67
67
  - lib/anystyle/cli/commands/base.rb
68
+ - lib/anystyle/cli/commands/check.rb
68
69
  - lib/anystyle/cli/commands/find.rb
69
70
  - lib/anystyle/cli/commands/parse.rb
71
+ - lib/anystyle/cli/commands/train.rb
70
72
  - lib/anystyle/cli/version.rb
71
73
  homepage: http://anystyle.io
72
74
  licenses:
@@ -88,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
90
  version: '0'
89
91
  requirements: []
90
92
  rubyforge_project:
91
- rubygems_version: 2.7.4
93
+ rubygems_version: 2.6.13
92
94
  signing_key:
93
95
  specification_version: 4
94
96
  summary: AnyStyle CLI