easy_cols 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+ require_relative 'lib/easy_cols/version'
6
+
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task default: :spec
10
+
11
+ namespace :version do
12
+ desc 'Show current version'
13
+ task :current do
14
+ puts "Current version: #{EasyCols::VERSION}"
15
+ end
16
+
17
# Bumps the gem version stored in lib/easy_cols/version.rb.
#
# The current version is taken from EasyCols::VERSION (already loaded by the
# Rakefile), the requested component is incremented and the lower components
# are reset to zero, then the VERSION literal in the file is rewritten.
#
# @param type [Symbol] :major, :minor or :patch
# @return [String] the new version string
# @raise [RuntimeError] if +type+ is unknown, or if the version file does not
#   contain a VERSION assignment for the current version
def bump_version(type)
  version_file = File.join(__dir__, 'lib', 'easy_cols', 'version.rb')
  content = File.read(version_file)

  current_version = EasyCols::VERSION
  major, minor, patch = current_version.split('.').map(&:to_i)

  new_version =
    case type
    when :major then "#{major + 1}.0.0"
    when :minor then "#{major}.#{minor + 1}.0"
    when :patch then "#{major}.#{minor}.#{patch + 1}"
    else raise "Unknown version type: #{type}"
    end

  # Escape the version so that the dots only match literal dots.
  version_pattern = /VERSION = ['"]#{Regexp.escape(current_version)}['"]/

  # Refuse to report success when nothing would actually be rewritten.
  unless content.match?(version_pattern)
    raise "Could not find VERSION = '#{current_version}' in #{version_file}"
  end

  File.write(version_file, content.sub(version_pattern, "VERSION = '#{new_version}'"))

  puts "Version bumped from #{current_version} to #{new_version}"
  puts "Updated #{version_file}"
  puts "\nDon't forget to:"
  puts "  git add #{version_file}"
  puts "  git commit -m 'Bump version to #{new_version}'"

  new_version
end
52
+
53
# One rake task per version component; each simply delegates to bump_version.
# The hash value is the "shape" shown in the task description.
{
  major: 'x.0.0',
  minor: '0.x.0',
  patch: '0.0.x'
}.each do |component, shape|
  desc "Bump #{component} version (#{shape})"
  task(component) { bump_version(component) }
end
67
+ end
68
+
69
desc 'Create and push release tag for current version'
task :release do
  require 'open3'

  # Runs a git subcommand and returns its stripped stdout.
  # Raises when git exits non-zero, so a failed command (for example a failed
  # fetch) can never be mistaken for "empty output".
  git = lambda do |*args|
    output, status = Open3.capture2('git', *args)
    raise "Command failed: git #{args.join(' ')}" unless status.success?

    output.strip
  end

  version = EasyCols::VERSION
  tag_name = "v#{version}"

  # Check we're on main branch
  current_branch = git.call('rev-parse', '--abbrev-ref', 'HEAD')
  unless current_branch == 'main'
    raise "Must be on main branch to release. Current branch: #{current_branch}"
  end

  # Check for uncommitted changes
  unless git.call('status', '--porcelain').empty?
    raise "Working directory has uncommitted changes. Commit or stash them first."
  end

  # Check if tag already exists (git prints nothing when no tag matches)
  unless git.call('tag', '-l', tag_name).empty?
    raise "Tag #{tag_name} already exists. Did you forget to bump the version?"
  end

  # Check if we're up to date with remote; the fetch must succeed, otherwise
  # origin/main below could be stale.
  git.call('fetch', 'origin')
  local_commit = git.call('rev-parse', 'HEAD')
  remote_commit = git.call('rev-parse', 'origin/main')
  if local_commit != remote_commit
    raise "Local main is not up to date with origin/main. Please pull or push first."
  end

  # Create and push tag
  puts "Creating release tag #{tag_name}..."
  system('git', 'tag', '-a', tag_name, '-m', "Release #{tag_name}") || raise("Failed to create tag")
  puts "Pushing tag to origin..."
  system('git', 'push', 'origin', tag_name) || raise("Failed to push tag")

  puts "\n✓ Release tag #{tag_name} created and pushed!"
  puts "CI will automatically create the GitHub release and publish to RubyGems."
end
112
+
data/TODO.md ADDED
@@ -0,0 +1,14 @@
1
+ # TODO
2
+
3
+ ## Future Enhancements
4
+
5
+ ### Streaming Pipeline Support
6
+ Implement a pure streaming pipeline to handle arbitrarily large files without loading the entire input into memory. This would enable processing files of any size by:
7
+
8
+ - Reading input line-by-line instead of loading everything into memory
9
+ - Detecting headers and separator lines incrementally (especially for table format)
10
+ - Outputting rows incrementally as they're processed
11
+ - Handling column selection with header-based selectors in a streaming context
12
+
13
+ This would trade some parsing flexibility (look-ahead/backtrack) for memory efficiency, which could be valuable for very large inputs.
14
+
data/USAGE.md ADDED
@@ -0,0 +1,78 @@
1
+ ec [FIELD | FIELD1..FIELD2 | FIELD1-FIELD2] ...
2
+
3
+ Ranges of fields can be specified with STARTFIELD..STOPFIELD or STARTFIELD-STOPFIELD.
4
+ FIELD can be a field index (an integer), or a field name.
5
+
6
+ The basic idea is to make it *easy* to fetch specific columns of STDIN (or a file).
7
+ There are several supported *formats* for parsing:
8
+
9
+ csv Comma-separated columns; fields can be quoted; there can be one header line
10
+ tsv Tab-separated columns; fields can be quoted; there can be one header line
11
+ tbl An ASCII table format, with a header, a separator line (using -+-), and data rows (using | separator)
12
+
13
+
14
+ Options:
15
+
16
+ --in=FORMAT # input format (default: auto)
17
+ --out=FORMAT # output format (default: same)
18
+ --delim=CHARS | -d CHARS # split fields by CHARS
19
+ --pattern=PATTERN | -p PATTERN # split fields by PATTERN
20
+ --format FORMAT | -f FORMAT # assume input is in FORMAT (deprecated, use --in)
21
+ --quotes | -q # parse matching quotes before splitting
22
+ --headers[=NUM] | -h NUM # ignore header line(s), NUM=1 by default
23
+ --lines | -l # ignore horizontal lines ("---*", "___*")
24
+ --blanklines | -b # ignore blank lines
25
+ --comments=PATTERN | -c PATTERN # strip any comment prefix
26
+ --cprefix=PATTERN | -c PATTERN
27
+ --cblock=PATTERN | -
28
+ --start=[NUM | PAT] | -s NUM_PAT # ignore lines before NUM or PAT
29
+ --stop=[NUM | PAT] | -S NUM_PAT # stop at line NUM, or first line matching PAT
30
+ --no-quotes | -Q # do not parse out matched quotes
31
+ --no-headers[=NUM] | -H NUM # do not ignore header lines (NUM=1 by default)
32
+ --no-lines | -L # do not ignore horizontal lines
33
+ --no-blanklines | -B # do not ignore blank lines
34
+ --csv # parse input as CSV format (sets --in=csv)
35
+ --tsv # parse input as TSV format (sets --in=tsv)
36
+ --tbl # parse input as table format (sets --in=table)
37
+ --table # output as table format (sets --out=table with pipe separator)
38
+ --plain # parse input as plain format (sets --in=plain)
39
+ --LANGUAGE # set comments pattern according to the LANGUAGE
40
+
41
+ CHARS="\s" # default is whitespace, same as " \t\n\r"
42
+ CHARS=":" # separate fields with ':'
43
+ CHARS="," # separate fields with ','
44
+
45
+ Input Formats (--in=FORMAT):
46
+ --in=csv # Comma-separated values, fields can be quoted
47
+ --in=tsv # Tab-separated values
48
+ --in=table # ASCII table format with header, separator line, and data
49
+ --in=tbl # Alias for table
50
+ --in=plain # Whitespace-separated values
51
+ --in=auto # Auto-detect format (default)
52
+
53
+ Output Formats (--out=FORMAT):
54
+ --out=csv # Output as CSV (comma-separated, properly quoted)
55
+ --out=tsv # Output as TSV (tab-separated)
56
+ --out=table # Output as ASCII table (with column widths and separator lines)
57
+ --out=tbl # Alias for table
58
+ --out=plain # Output as whitespace-separated (aligned columns)
59
+ --out=same # Use same format as input (default)
60
+
61
+ Format Conversion Examples:
62
+ ec --in=csv --out=table data.csv # Convert CSV to table format
63
+ ec --in=table --out=csv data.txt # Convert table to CSV format
64
+ ec --out=table data.csv # Auto-detect input, convert to table
65
+ ec --in=tsv --out=csv data.tsv # Convert TSV to CSV
66
+
67
+ The comments pattern can be given explicitly, or can be inferred by the input
68
+ file type (if given), or by --LANGUAGE option, where LANGUAGE is one of: ruby,
69
+ c, go, elixir, python, java, scala, etc. Each language has a different style
70
+ for doing block comments, and this tool can be used to extract columns from
71
+ blocks of comment text anywhere in a file, given the --start and --stop patterns,
72
+ along with the comments pattern.
73
+
74
+ Examples:
75
+ --ruby # => --comments="^\\s*# "
76
+ --js # => --comments="^\\s*//"
77
+ --python # => --comments="^\\s*# "
78
+
data/bin/easy_cols ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative '../lib/easy_cols'
5
+
6
+ EasyCols::CLI.new.run(ARGV)
7
+
data/bin/ec ADDED
@@ -0,0 +1 @@
1
+ easy_cols
data/cols.gemspec ADDED
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/easy_cols/version'
4
+
5
# Gem specification for easy_cols: identity, packaged files, executables,
# Ruby version requirement, dependencies and registry metadata.
Gem::Specification.new do |spec|
  # --- Identity -------------------------------------------------------------
  spec.name = 'easy_cols'
  spec.version = EasyCols::VERSION
  spec.authors = ['Alan K. Stebbens']
  spec.email = ['aks@stebbens.org']

  spec.summary = 'A powerful command-line tool for extracting and processing columns from structured text data'
  spec.description = <<~DESC
    EasyCols is a flexible command-line utility for extracting specific columns from
    structured text data in various formats (CSV, TSV, table, plain text). It supports
    sophisticated parsing options including quote handling, comment stripping, header
    processing, and language-specific comment patterns. It can be used on both files and STDIN.
  DESC
  spec.homepage = 'https://github.com/aks/easy_cols'
  spec.license = 'MIT'

  # --- Packaged files -------------------------------------------------------
  # Everything tracked by git, minus the test/spec/features trees.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.grep_v(%r{\A(?:test|spec|features)/})
  end
  spec.bindir = 'bin'
  spec.executables = %w[easy_cols ec]
  spec.require_paths = %w[lib]

  spec.required_ruby_version = '>= 3.2.0'

  # --- Runtime dependencies -------------------------------------------------
  {
    'csv' => '~> 3.0',
    'optparse' => '~> 0.1'
  }.each { |gem_name, constraint| spec.add_dependency gem_name, constraint }

  # --- Development dependencies ---------------------------------------------
  {
    'fuubar' => '~> 2.5',
    'rspec' => '~> 3.12',
    'rubocop' => '~> 1.50',
    'rubocop-rspec' => '~> 2.20',
    'simplecov' => '~> 0.22',
    'rake' => '~> 13.0'
  }.each { |gem_name, constraint| spec.add_development_dependency gem_name, constraint }

  # --- Registry metadata ----------------------------------------------------
  spec.metadata.update(
    'rubygems_mfa_required' => 'true',
    'homepage_uri' => spec.homepage,
    'source_code_uri' => 'https://github.com/aks/easy_cols',
    'changelog_uri' => 'https://github.com/aks/easy_cols/blob/main/CHANGELOG.md',
    'bug_tracker_uri' => 'https://github.com/aks/easy_cols/issues'
  )
end
@@ -0,0 +1,252 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+ require 'stringio'
5
+
6
module EasyCols
  # Name of the running executable; used in the usage banner and examples.
  $PROG = File.basename($0)

  # Command-line front end.
  #
  # Parses options and column selectors from ARGV, reads the input from a file
  # or STDIN, and then either prints the selected columns or (with --count)
  # prints a column count summary.
  class CLI
    # A selector that is a single non-negative column index, e.g. "3".
    INDEX_PATTERN = /\A\d+\z/
    # A selector that is an inclusive index range, e.g. "2-5".
    RANGE_PATTERN = /\A\d+-\d+\z/

    def initialize
      @options = {}
      @column_selectors = []
    end

    # Parses +argv+ and processes the input.
    #
    # Errors are reported on STDERR and terminate the process with status 1.
    #
    # @param argv [Array<String>] command-line arguments (modified in place by
    #   the option parser)
    def run(argv)
      parse_options(argv)
      process_input
    rescue Error => e
      warn "Error: #{e.message}"
      exit 1
    rescue StandardError => e
      warn "Unexpected error: #{e.message}"
      exit 1
    end

    private

    # Fills @options from the switches in +argv+, then takes the first
    # remaining argument as the input path and the rest as column selectors.
    def parse_options(argv)
      OptionParser.new do |opts|
        opts.banner = "Usage: #{$PROG} [options] <file> [column_selectors...]"

        opts.separator <<~HELP

          Extract and display specific columns from structured text data.

          Column selectors can be:
          - Column index: 0, 1, 2, etc. (0-based)
          - Column range: 0-5, 2-10, etc.
          - Comma-separated indices: 0,2,5
          - Header name: 'Name', 'Email', etc.

          Examples:
          #{$PROG} data.csv 0 1 2 # Show columns 0, 1, 2
          #{$PROG} data.csv 'Name' 'Email' # Show Name and Email columns
          #{$PROG} data.csv 0-5 # Show columns 0 through 5
          #{$PROG} --format tsv data.tsv 0 1 # Parse as TSV
          #{$PROG} --table data.txt 0 1 2 # Parse as table format
          #{$PROG} - < data.csv # Read from STDIN

          Options:
        HELP

        opts.on('--in=FORMAT', Parser::SUPPORTED_FORMATS,
                "Input format (default: auto, formats: #{Parser::SUPPORTED_FORMATS.join(', ')})") do |format|
          @options[:input_format] = format
        end

        opts.on('--out=FORMAT', Formatter::SUPPORTED_OUTPUT_FORMATS,
                "Output format (default: same, formats: #{Formatter::SUPPORTED_OUTPUT_FORMATS.join(', ')})") do |format|
          @options[:output_format] = format
        end

        # Legacy options for backward compatibility
        opts.on('-f', '--format=FORMAT', Parser::SUPPORTED_FORMATS,
                "Input format (deprecated, use --in)") do |format|
          @options[:input_format] = format
        end

        opts.on('-d', '--delimiter=CHARS', 'Field delimiter') do |delim|
          @options[:delimiter] = delim
        end

        opts.on('-D', '--output-delimiter=STR', 'Output separator (default: " , ")') do |str|
          @options[:output_separator] = str
        end

        opts.on('-H', '--no-header', 'Do not output header row') do
          @options[:no_header] = true
        end

        opts.on('--table', 'Use table format for output (sets output format to table)') do
          # Only set output format, not input format
          # Input format will be auto-detected unless explicitly set
          @options[:output_format] = 'table'
          @options[:output_separator] = ' | '
          @options[:table_mode] = true
        end

        opts.on('--pipe', 'Use pipe separator (" | ")') do
          @options[:output_separator] = ' | '
        end

        opts.on('--tab', 'Use tab separator') do
          @options[:output_separator] = "\t"
        end

        opts.on('--comma', 'Use comma separator (",")') do
          @options[:output_separator] = ','
        end

        # Convenience format options for input
        opts.on('--csv', 'Parse input as CSV format') do
          @options[:input_format] = 'csv'
        end

        opts.on('--tsv', 'Parse input as TSV format') do
          @options[:input_format] = 'tsv'
        end

        opts.on('--tbl', 'Parse input as table format') do
          @options[:input_format] = 'table'
        end

        opts.on('--plain', 'Parse input as plain (whitespace-separated) format') do
          @options[:input_format] = 'plain'
        end

        opts.on('-v', '--verbose', 'Verbose output') do
          @options[:verbose] = true
        end

        opts.on('-q', '--quiet', 'Quiet output') do
          @options[:quiet] = true
        end

        opts.on('-c', '--count', 'Count columns instead of selecting') do
          @options[:count_mode] = true
        end

        opts.on('-h', '--help', 'Show this help') do
          puts opts
          exit 0
        end
      end.parse!(argv)

      @file_path = argv[0]
      @column_selectors = parse_column_selectors(argv[1..]) if argv.length > 1
    end

    # Converts raw selector strings into the values ColumnSelector accepts.
    #
    #   "3"       => 3
    #   "1-3"     => [1, 2, 3]
    #   "0,2,5"   => [0, 2, 5]
    #   "0-2,5"   => [0, 1, 2, 5]
    #   anything else (including "Last, First" or "a,b") => the string itself,
    #   to be looked up as a header name.
    #
    # A comma-separated selector is only treated as an index list when every
    # part is an index or a range; otherwise it is kept as a header name
    # instead of being silently coerced to column 0.
    #
    # @param selectors [Array<String>]
    # @return [Array<Integer, Array<Integer>, String>]
    def parse_column_selectors(selectors)
      selectors.map do |selector|
        parsed = selector.include?(',') ? parse_index_list(selector) : parse_index_item(selector)
        parsed || selector
      end
    end

    # Parses a single index ("3") or inclusive range ("1-3").
    # Returns an Integer, an Array of Integers, or nil when +text+ is neither.
    def parse_index_item(text)
      case text
      when INDEX_PATTERN
        text.to_i
      when RANGE_PATTERN
        first, last = text.split('-').map(&:to_i)
        (first..last).to_a
      end
    end

    # Parses a comma-separated list of indices and/or ranges into a flat Array
    # of Integers. Whitespace around each part is ignored. Returns nil when the
    # list is empty or any part is not an index or range.
    def parse_index_list(text)
      parts = text.split(',').map(&:strip)
      return nil if parts.empty?

      items = parts.map { |part| parse_index_item(part) }
      return nil if items.any?(&:nil?)

      items.flatten
    end

    # Reads the input and dispatches to count mode or column selection.
    def process_input
      input_data = read_input

      if @options[:count_mode]
        count_columns(input_data)
      else
        select_columns(input_data)
      end
    end

    # Returns the whole input as a String: STDIN when the path is "-" or was
    # not given, otherwise the contents of the named file.
    def read_input
      if @file_path == '-' || @file_path.nil?
        $stdin.read
      else
        File.read(@file_path)
      end
    end

    # Builds a Parser from the input-related options and parses +input_data+.
    #
    # @return [Array] the parser (needed later for its detected format) and
    #   the parsed data
    def parse_input(input_data)
      # Parser only needs input/parsing options; unset ones are dropped so the
      # parser's own defaults apply.
      parser_options = {
        format: @options[:input_format] || 'auto',
        delimiter: @options[:delimiter]
      }.compact

      parser = Parser.new(**parser_options)
      [parser, parser.parse(input_data)]
    end

    # Parses the input, resolves the requested columns and prints the
    # formatted result. Prints nothing when the parsed data is empty.
    def select_columns(input_data)
      parser, data = parse_input(input_data)

      return if data.empty?

      # The first parsed row is used as the header row.
      headers = data.first
      selector = ColumnSelector.new(headers)

      # If no selectors provided, default to all columns
      selectors = if @column_selectors.empty?
                    (0...headers.length).to_a
                  else
                    @column_selectors
                  end

      selected_indices = selector.select(selectors)

      # Determine output format.
      # 'same' is converted to the detected/parsed input format, unless an
      # output separator was set explicitly: then 'same' is passed through so
      # the formatter can honour the separator.
      actual_input_format = parser.detected_format || 'csv'
      output_format = @options[:output_format] || 'same'

      if output_format == 'same' && !@options[:output_separator]
        output_format = actual_input_format
      end

      # Formatter needs output format and options
      formatter_options = {
        format: output_format,
        separator: @options[:output_separator] || ' , ',
        show_header: !@options[:no_header],
        table_mode: @options[:table_mode] || %w[table tbl].include?(output_format)
      }

      formatter = Formatter.new(formatter_options)
      output = formatter.format(data, selected_indices)

      puts output
    end

    # Prints the header names and total column count, followed by the column
    # count of every subsequent row. With --quiet only the total is printed.
    # Prints nothing when the parsed data is empty.
    def count_columns(input_data)
      _parser, data = parse_input(input_data)

      return if data.empty?

      headers = data.first
      puts "Headers: #{headers.join(', ')}" unless @options[:quiet]
      puts "Total columns: #{headers.length}"

      data[1..].each_with_index do |row, index|
        puts "Row #{index + 1}: #{row.length} columns" unless @options[:quiet]
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module EasyCols
  # Resolves column selectors (indices, ranges, index lists and header names)
  # into column indices, relative to a given header row.
  class ColumnSelector
    # @param headers [Array<String>] header row; its length defines the valid
    #   index range and its entries are matched by name
    def initialize(headers)
      @headers = headers
    end

    # Resolves every selector and returns the matching column indices,
    # de-duplicated and sorted in ascending order.
    #
    # @param selectors [Array<Integer, Range, Array<Integer>, String>]
    # @return [Array<Integer>]
    # @raise [SelectionError] for an out-of-range single index, an unknown
    #   header name, or an unsupported selector type
    def select(selectors)
      indices = []

      selectors.each do |selector|
        result = case selector
                 when Integer then select_by_index(selector)
                 when Range then select_by_range(selector)
                 when Array then select_by_array(selector)
                 when String then select_by_name(selector)
                 else
                   raise SelectionError, "Invalid selector type: #{selector.class}"
                 end
        indices.concat(result)
      end

      indices.uniq.sort
    end

    private

    # A single explicit index must exist; anything else is an error.
    def select_by_index(index)
      if index >= 0 && index < @headers.length
        [index]
      else
        raise SelectionError, "Column index #{index} is out of range (0-#{@headers.length - 1})"
      end
    end

    # Returns the in-range indices covered by +range+, warning about each
    # index that falls outside the headers.
    #
    # Open-ended ranges are resolved against the headers instead of being
    # expanded with #to_a (which raises for them): a missing begin means the
    # first column, a missing end means the last column.
    def select_by_range(range)
      first = range.begin || 0
      last = range.end

      bounded = if last.nil?
                  first..(@headers.length - 1)
                elsif range.exclude_end?
                  first...last
                else
                  first..last
                end

      bounded.select { |idx| in_range?(idx) }
    end

    # Returns the in-range entries of +array+, warning about the others.
    def select_by_array(array)
      array.select { |idx| in_range?(idx) }
    end

    # True when +index+ addresses an existing column. As a side effect, a
    # warning is written to STDERR for every index that does not.
    def in_range?(index)
      in_range = index >= 0 && index < @headers.length
      warn "Warning: Column index #{index} is out of range (0-#{@headers.length - 1})" unless in_range
      in_range
    end

    # Looks up a column by exact header name (first match wins).
    def select_by_name(name)
      header_idx = @headers.find_index(name)
      if header_idx
        [header_idx]
      else
        raise SelectionError, "Column '#{name}' not found. Available: #{@headers.join(', ')}"
      end
    end
  end
end
61
+