udise_school_report_reader 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +20 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +45 -0
  5. data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
  6. data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
  7. data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
  8. data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
  9. data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
  10. data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
  11. data/lib/udise_school_report_reader/csv_writer.rb +75 -0
  12. data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
  13. data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
  14. data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
  15. data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
  16. data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
  17. data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
  18. data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
  19. data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
  20. data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
  21. data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
  22. data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
  23. data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
  24. data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
  25. data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
  26. data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
  27. data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
  28. data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
  29. data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
  30. data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
  31. data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
  32. data/lib/udise_school_report_reader/version.rb +3 -0
  33. data/lib/udise_school_report_reader.rb +41 -0
  34. data/test/school_report_parser_test.rb +62 -0
  35. metadata +165 -0
@@ -0,0 +1,53 @@
1
# Scans the raw content stream of every page for rectangle (`re`) operators
# and records each rectangle together with the stroke colour, fill colour and
# line width that were most recently seen in the stream.
class PDFRectangleExtractor
  # A PDF numeric operand: optional sign, integer or decimal (including the
  # leading-dot form such as ".5").
  NUMBER = /[-+]?(?:\d+\.?\d*|\.\d+)/

  # Operators are terminated with \b so that longer operators which merely
  # start with the same letter (e.g. "/GS0 gs") are not mistaken for them.
  STROKE_COLOR = /[\d.]+ [\d.]+ [\d.]+ RG\b|[\d.]+ G\b/
  FILL_COLOR   = /[\d.]+ [\d.]+ [\d.]+ rg\b|[\d.]+ g\b/
  LINE_WIDTH   = /(#{NUMBER})\s+w\b/
  RECTANGLE    = /(#{NUMBER})\s+(#{NUMBER})\s+(#{NUMBER})\s+(#{NUMBER})\s+re\b/

  # @param reader [#pages] object whose pages each respond to #raw_content
  # @return [Array<Hash>] one hash per rectangle with the keys :page, :x, :y,
  #   :width, :height, :stroke_color, :fill_color and :line_width
  # @raise [ArgumentError] when reader is nil
  def self.extract_rectangles(reader)
    raise ArgumentError, "PDF reader cannot be nil" if reader.nil?

    rectangles = []
    current_color = '0 G'           # Default stroke color (black)
    current_fill_color = '1 1 1 rg' # Default fill color (white)
    current_line_width = 1.0        # Default line width

    reader.pages.each_with_index do |page, index|
      page_number = index + 1

      page.raw_content.each_line do |line|
        # The colour values are stored as the whole (stripped) source line.
        current_color = line.strip if line.match?(STROKE_COLOR)
        current_fill_color = line.strip if line.match?(FILL_COLOR)

        if (width_match = line.match(LINE_WIDTH))
          current_line_width = width_match[1].to_f
        end

        # A single line may carry more than one rectangle operator.
        line.scan(RECTANGLE) do |operands|
          x, y, width, height = normalize(*operands.map(&:to_f))

          rectangles << {
            page: page_number,
            x: x,
            y: y,
            width: width,
            height: height,
            stroke_color: current_color,
            fill_color: current_fill_color,
            line_width: current_line_width
          }
        end
      end
    end

    rectangles
  end

  # Rewrites a rectangle given with a negative width and/or height as the
  # same area expressed with a lower-left origin and positive dimensions.
  def self.normalize(x, y, width, height)
    if width.negative?
      x += width
      width = -width
    end
    if height.negative?
      y += height
      height = -height
    end
    [x, y, width, height]
  end
  private_class_method :normalize
end
@@ -0,0 +1,36 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Declares where the room-related labels of the report are stored in the
# extracted data. The mapping is consumed by DataReaderBase (not shown here).
class RoomsDataReader
  include DataReaderBase

  # Label text => extraction settings:
  #   key_path    - nested keys under which the value is stored
  #   value_type  - :integer where given; omitted for the two availability fields
  #   end_pattern - pattern associated with the text that follows the field
  FIELD_MAPPINGS = {
    'In Good Condition' => {
      key_path: %w[rooms classrooms good_condition],
      value_type: :integer,
      end_pattern: /Needs Minor/
    },
    'Needs Minor Repair' => {
      key_path: %w[rooms classrooms needs_minor_repair],
      value_type: :integer,
      end_pattern: /Needs Major/
    },
    'Needs Major Repair' => {
      key_path: %w[rooms classrooms needs_major_repair],
      value_type: :integer,
      end_pattern: /Other Rooms/
    },
    'Other Rooms' => {
      key_path: %w[rooms other],
      value_type: :integer,
      end_pattern: /Library/
    },
    'Library Availability' => {
      key_path: %w[rooms library],
      end_pattern: /Solar/
    },
    'Separate Room for HM' => {
      key_path: %w[rooms hm],
      end_pattern: /Drinking/
    }
  }
end
@@ -0,0 +1,118 @@
1
require 'csv'

# Locates the "Section 12 of the RTE Act" table in the combined CSV and
# exposes its grade header cells, its B/G (boys/girls) marker cells and the
# value cells that sit under each marker.
class RteDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII'
  ]

  # Text of the cell that heads the table.
  TITLE_TEXT = 'Total no. of Students Enrolled Under Section 12 of the RTE Act ' \
               'In Private Unaided and Specified Category Schools'

  # Maximum horizontal distance between a marker and the value below it.
  X_THRESHOLD = 10.0

  # Convenience wrapper: build a reader for csv_path and return #read.
  def self.read(csv_path) = new(csv_path).read

  # Parses the CSV eagerly. When the title row or the three rows beneath it
  # cannot be found, the row instance variables stay nil and #read returns nil.
  #
  # @param csv_path [String] CSV with at least the columns page, rect_y,
  #   text and text_x
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    # Group the cells of page 1 by the y coordinate of their rectangle.
    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end

    @title_row = @rows.find do |_, cells|
      cells.any? { |cell| cell['text'].to_s.include?(TITLE_TEXT) }
    end
    title_y = @title_row&.first
    return unless title_y

    # Rows with a smaller y than the title, largest y first.
    rows_after_title = @rows.select { |y, _| y < title_y }.sort_by(&:first).reverse
    return unless rows_after_title.size >= 3

    @grades_row, @bg_row, @values_row = rows_after_title.first(3).map(&:last)
    [@grades_row, @bg_row].each { |row| row.sort_by! { |cell| cell['text_x'].to_f } }

    @values_row = align_values_to_markers(@values_row, @bg_row)
  end

  # @return [Hash, nil] nil when the table was not found; otherwise a hash with
  #   :grade_rows  - grade header cells sorted by text_x
  #   :bg_pairs    - { midpoint x => [B cell, G cell] }
  #   :rte_numbers - { midpoint x => [boys value cell, girls value cell] }
  def read
    return nil unless @grades_row && @bg_row && @values_row

    bg_pairs = {}
    @bg_row.each_slice(2) do |pair|
      next unless pair.size == 2 && pair[0] && pair[1] # Skip incomplete pairs

      b, g = pair
      x_mid = (b['text_x'].to_f + g['text_x'].to_f) / 2
      bg_pairs[x_mid] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      rte_numbers: match_numbers_to_pairs(@values_row, bg_pairs),
    }
  end

  private

  # Builds the values row in marker order: for every complete B/G pair, the
  # first value cell within X_THRESHOLD of each marker, or a "-" placeholder
  # when there is none. Blank texts are normalised to "-".
  def align_values_to_markers(values, markers)
    aligned = []
    markers.each_slice(2) do |b, g|
      next unless b && g # a trailing unpaired marker has no partner to align

      [b, g].each do |marker|
        marker_x = marker['text_x'].to_f
        found = values.find { |cell| (cell['text_x'].to_f - marker_x).abs < X_THRESHOLD }
        aligned << (found || { 'text' => '-', 'text_x' => marker_x })
      end
    end

    # An empty CSV field can come back as nil, hence to_s before strip.
    aligned.each { |cell| cell['text'] = '-' if cell['text'].to_s.strip.empty? }
  end

  # For each B/G pair, takes the first not-yet-used number cell whose text_x
  # lies within threshold of the B marker, then likewise for the G marker.
  # Either slot is nil when nothing is close enough.
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      next unless bg_pair && bg_pair.size == 2 # Skip invalid pairs

      b_x = bg_pair[0]['text_x'].to_f
      g_x = bg_pair[1]['text_x'].to_f

      b_num = remaining.find { |cell| (cell['text_x'].to_f - b_x).abs < threshold }
      remaining.delete(b_num) if b_num

      g_num = remaining.find { |cell| (cell['text_x'].to_f - g_x).abs < threshold }
      remaining.delete(g_num) if g_num

      numbers[x_mid] = [b_num, g_num]
    end

    numbers
  end
end
@@ -0,0 +1,63 @@
1
# Renders the data returned by RteDataReader#read as a standalone HTML table.
class RteHtmlWriter
  # Writes one table with a grade header row, a B/G header row and one row of
  # values.
  #
  # @param data [Hash, nil] hash with :grade_rows, :bg_pairs and :rte_numbers;
  #   nothing is written when nil
  # @param html_path [String] destination file
  # @return [Integer, nil] result of File.write, or nil when data is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    # RteDataReader publishes its values under :rte_numbers. :ews_numbers is
    # still accepted so hashes built for the previous key keep rendering.
    categories = [
      ['RTE', data[:rte_numbers] || data[:ews_numbers] || {}]
    ]

    # Generate table rows for all categories
    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # Keys may be the numeric midpoint or its string form.
        nums = numbers[x_mid.to_s] || numbers[x_mid] || []
        b_num = nums.first
        g_num = nums.last
        "<td>#{b_num ? b_num['text'] : ''}</td><td>#{g_num ? g_num['text'] : ''}</td>"
      end.join

      "    <tr>\n" \
      "      <td class=\"category\">#{category}</td>\n" \
      "      #{cells}\n" \
      "    </tr>"
    end.join("\n")

    # Generate grade headers
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{row['text']}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
        <title>RTE Enrollment Table</title>
        <style>
          table { border-collapse: collapse; margin-top: 20px; width: 100%; }
          th, td { border: 1px solid black; padding: 8px; text-align: center; }
          .header { font-weight: bold; background-color: #f0f0f0; }
          .grade { font-weight: bold; background-color: #e0e0e0; }
          .bg-pair { background-color: #f8f8f8; }
          .category { font-weight: bold; text-align: left; }
        </style>
      </head>
      <body>
        <h2>Enrolment Under Section 12 of the RTE Act</h2>
        <table>
          <tr class="grade">
            <th rowspan="2">Category</th>
            #{grade_headers}
          </tr>
          <tr class="bg-pair">
            #{bg_headers}
          </tr>
      #{table_rows}
        </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end
end
@@ -0,0 +1,61 @@
1
# Converts the hash produced by RteDataReader#read into a plain nested hash
# of boys/girls counts per grade, ready to be embedded in the YAML output.
class RteYamlWriter
  # Grade header text => key used in the output hash.
  GRADE_MAPPING = {
    'Pre-Pri.' => 'pre-pri.',
    'Class I' => 'class_i',
    'Class II' => 'class_ii',
    'Class III' => 'class_iii',
    'Class IV' => 'class_iv',
    'Class V' => 'class_v',
    'Class VI' => 'class_vi',
    'Class VII' => 'class_vii',
    'Class VIII' => 'class_viii',
    'Class IX' => 'class_ix',
    'Class X' => 'class_x',
    'Class XI' => 'class_xi',
    'Class XII' => 'class_xii'
  }

  # @param data [Hash, nil] hash with :grade_rows (cells exposing 'text',
  #   'rect_x' and 'rect_width') and :rte_numbers ({ x midpoint => [boys cell,
  #   girls cell] })
  # @return [Hash, nil] { 'rte' => { grade key => { 'boys' => Integer,
  #   'girls' => Integer } } }, or nil when data is nil
  def self.format_yaml(data)
    return unless data

    header_cells = data[:grade_rows]

    # Output key for each header cell, in column order. Unknown grade labels
    # fall back to the lower-cased text with whitespace runs turned into "_".
    grade_keys = header_cells.map do |cell|
      label = cell['text']
      GRADE_MAPPING[label] || label.downcase.gsub(/\s+/, '_')
    end

    # Every grade starts at zero so grades without values still appear.
    per_grade = {}
    grade_keys.each { |key| per_grade[key] = { 'boys' => 0, 'girls' => 0 } }

    # "-" marks an absent value and counts as zero.
    as_count = lambda do |cell|
      text = cell&.dig('text')
      text == '-' ? 0 : text.to_i
    end

    data[:rte_numbers].each do |x_mid, pair|
      next unless pair && pair.size == 2

      # The first header cell whose horizontal extent contains the midpoint
      # of the B/G pair decides which grade the values belong to.
      column = header_cells.each_index.find do |idx|
        left = header_cells[idx]['rect_x'].to_f
        right = left + header_cells[idx]['rect_width'].to_f
        x_mid.between?(left, right)
      end
      next unless column && grade_keys[column]

      counts = per_grade[grade_keys[column]]
      counts['boys'] = as_count.call(pair[0])
      counts['girls'] = as_count.call(pair[1])
    end

    { 'rte' => per_grade }
  end
end
@@ -0,0 +1,56 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Declares where the hand-wash and toilet figures of the report are stored in
# the extracted data. The mapping is consumed by DataReaderBase (not shown
# here).
class SanitationDataReader
  include DataReaderBase

  # Label text => extraction settings. The two hand-wash entries are plain
  # fields; 'Toilets' is declared as a table made of four sections, each with
  # a trigger, an offset of 1 and a boys/girls pair of integer fields.
  FIELD_MAPPINGS = {
    'Handwash Near Toilet' => {
      key_path: %w[sanitation handwash near_toilet],
      end_pattern: /Handwash Facility/
    },
    'Handwash Facility for Meal' => {
      key_path: %w[sanitation handwash for_meal],
      end_pattern: /Total Class/
    },
    'Toilets' => {
      key_path: %w[sanitation toilets],
      is_table: true,
      table_config: {
        sections: [
          {
            trigger: /Total.*CWSN/,
            offset: 1,
            fields: [
              { key: %w[boys total], value_type: :integer },
              { key: %w[girls total], value_type: :integer }
            ]
          },
          {
            trigger: "Functional",
            offset: 1,
            fields: [
              { key: %w[boys functional], value_type: :integer },
              { key: %w[girls functional], value_type: :integer }
            ]
          },
          {
            trigger: /CWSN Friendly/,
            offset: 1,
            fields: [
              { key: %w[boys cwsn], value_type: :integer },
              { key: %w[girls cwsn], value_type: :integer }
            ]
          },
          {
            trigger: "Urinal",
            offset: 1,
            fields: [
              { key: %w[boys urinals], value_type: :integer },
              { key: %w[girls urinals], value_type: :integer }
            ]
          }
        ]
      }
    }
  }
end
@@ -0,0 +1,295 @@
1
+ require 'pdf-reader'
2
+ require 'yaml'
3
+ require 'csv'
4
+ require 'fileutils'
5
+ require 'tempfile'
6
+
7
+ module UdiseSchoolReportReader
8
+ class SchoolReportParser
9
# Entry point: parses the report at pdf_path and returns the extracted data.
#
# @param pdf_path [String] path of the PDF report
# @param output_dir [String, nil] passed through to write_output_files
# @param write_files [Boolean] when truthy, the intermediate and final files
#   are written in addition to returning the data
# @return the :yaml_data entry of the hash built by extract_data
# @raise [ArgumentError] when no file exists at pdf_path
def self.extract_to_text(pdf_path, output_dir = nil, write_files = false)
  unless File.exist?(pdf_path)
    raise ArgumentError, "PDF file not found"
  end

  # All extraction happens before anything is written.
  extract_data(pdf_path).then do |result|
    write_output_files(pdf_path, result, output_dir) if write_files
    result[:yaml_data]
  end
end
21
+
22
+ private
23
+
24
# Runs every extraction step for one PDF and returns all intermediate and
# final results in a single hash.
#
# @param pdf_path [String] path of the PDF report
# @return [Hash] keys :content, :compressed_content, :blocks, :rectangles,
#   :combined_data, :enrollment_data, :ews_data, :rte_data and :yaml_data
def self.extract_data(pdf_path)
  pdf = PDF::Reader.new(pdf_path)

  # Raw page streams, joined, plus their compressed form.
  raw_text = pdf.pages.map(&:raw_content).join("\n")
  compact_text = PDFContentCompressor.compress(raw_text)

  # Text blocks and rectangles, then the two combined.
  text_blocks = PDFBlockExtractor.extract_blocks(pdf)
  rects = PDFRectangleExtractor.extract_rectangles(pdf)
  merged = BlockRectangleCombiner.combine(text_blocks, rects)

  report = extract_data_points(compact_text)

  # The table readers take a CSV path, so the combined data is written to a
  # scratch file that is removed again once they have run.
  scratch = Tempfile.new(['combined', '.csv'])
  begin
    csv_path = scratch.path
    CSVWriter.write_combined(merged, csv_path)

    enrollment = EnrollmentDataReader.read(csv_path)
    ews = EwsDataReader.read(csv_path)
    rte = RteDataReader.read(csv_path)

    # Only the enrollment entry is conditional; the other two keys are always
    # assigned.
    report['enrollment_data'] = EnrollmentYamlWriter.format_yaml(enrollment) if enrollment
    report['ews_data'] = EwsYamlWriter.format_yaml(ews)
    report['rte_data'] = RteYamlWriter.format_yaml(rte)

    {
      content: raw_text,
      compressed_content: compact_text,
      blocks: text_blocks,
      rectangles: rects,
      combined_data: merged,
      enrollment_data: enrollment,
      ews_data: ews,
      rte_data: rte,
      yaml_data: report
    }
  ensure
    scratch.close!
  end
end
70
+
71
# Writes every intermediate and final artefact of one extraction run.
#
# @param pdf_path [String] source PDF; its name determines the output names
# @param data [Hash] the hash returned by extract_data
# @param output_dir [String, nil] destination directory; when nil the
#   locations chosen by OutputPaths are used
def self.write_output_files(pdf_path, data, output_dir)
  # OutputPaths creates its fallback tmp directory on demand but does not
  # create a caller-supplied directory, so make sure that one exists too.
  FileUtils.mkdir_p(output_dir) if output_dir

  paths = OutputPaths.new(pdf_path, output_dir)

  # Write text files
  File.write(paths.txt, data[:content])
  File.write(paths.compressed_txt, data[:compressed_content])

  # Write CSV files
  CSVWriter.write_blocks(data[:blocks], paths.blocks_csv)
  CSVWriter.write_rectangles(data[:rectangles], paths.rects_csv)
  CSVWriter.write_combined(data[:combined_data], paths.combined_csv)

  # Write HTML files
  RteHtmlWriter.generate_html(data[:rte_data], paths.rte_html)
  EnrollmentHtmlWriter.generate_html(data[:enrollment_data], paths.enrollment_html)
  EwsHtmlWriter.generate_html(data[:ews_data], paths.ews_html)

  # Write YAML file
  File.write(paths.yaml, data[:yaml_data].to_yaml)
end
91
+
92
# Computes the path of each output file for one PDF. One reader method is
# generated per EXTENSIONS key (txt, compressed_txt, blocks_csv, ...).
class OutputPaths
  # Output kind => suffix appended to the PDF's base name.
  EXTENSIONS = {
    txt: '.txt',
    compressed_txt: '_compressed.txt',
    blocks_csv: '_blocks.csv',
    rects_csv: '_rects.csv',
    combined_csv: '_combined.csv',
    rte_html: '_rte.html',
    enrollment_html: '_enrollment.html',
    ews_html: '_ews.html',
    yaml: '.yml'
  }

  # @param pdf_path [String] path of the source PDF
  # @param output_dir [String, nil] directory for all outputs, or nil to use
  #   the default locations
  def initialize(pdf_path, output_dir)
    @pdf_path = pdf_path
    @output_dir = output_dir
    @base_name = File.basename(pdf_path, '.pdf')
  end

  EXTENSIONS.each_pair do |kind, suffix|
    define_method(kind) do
      File.join(directory_for(kind), "#{@base_name}#{suffix}")
    end
  end

  private

  # Directory an output of the given kind goes to: the explicit output
  # directory when one was given; otherwise next to the PDF for the YAML file
  # and ./tmp (created if needed) for everything else.
  def directory_for(kind)
    return @output_dir if @output_dir
    return File.dirname(@pdf_path) if kind == :yaml

    File.join(File.expand_path('.'), 'tmp').tap { |dir| FileUtils.mkdir_p(dir) }
  end
end
125
+
126
# Builds the report hash from the compressed page text.
#
# The hash starts as the contents of template.yml, is overlaid with the
# results of the dedicated section readers, and is then filled in by a pass
# over the lines in which a label line is followed by its value line(s).
# Finally, nil values and empty hashes are removed from the top two levels.
#
# @param compressed_content [String] newline-separated text
# @return [Hash] the populated report data
def self.extract_data_points(compressed_content)
  lines = compressed_content.split("\n").map { |line| line.strip.gsub(/\\/, '') } # Remove escape characters

  # Load template as base structure.
  # NOTE(review): the relative path is resolved against the current working
  # directory, not against this file - confirm that is intended.
  data = YAML.load_file('template.yml')

  # Extract data using readers
  basic_info_data = BasicInfoDataReader.read(lines)
  location_data = LocationDataReader.read(lines)
  official_data = OfficialDataReader.read(lines)
  characteristics_data = CharacteristicsReader.read(lines)
  digital_facilities_data = DigitalFacilitiesDataReader.read(lines)
  anganwadi_data = AnganwadiDataReader.read(lines)
  building_data = BuildingDataReader.read(lines)
  rooms_data = RoomsDataReader.read(lines)
  teacher_data = TeacherDataReader.read(lines)
  sanitation_data = SanitationDataReader.read(lines)
  activities_data = ActivitiesDataReader.read(lines)

  # Merge data from readers (shallow merge: a reader's top-level key replaces
  # the template's entry for that key)
  data.merge!(basic_info_data) if basic_info_data
  data.merge!(location_data) if location_data
  data.merge!(official_data) if official_data
  data.merge!(characteristics_data) if characteristics_data
  data.merge!(digital_facilities_data) if digital_facilities_data
  data.merge!(anganwadi_data) if anganwadi_data
  data.merge!(building_data) if building_data
  data.merge!(rooms_data) if rooms_data
  data.merge!(teacher_data) if teacher_data
  data.merge!(activities_data) if activities_data
  data.merge!(sanitation_data) if sanitation_data

  lines.each_with_index do |line, i|
    next_line = lines[i + 1]&.strip

    # For most labels the value is the following line. The "unless it matches"
    # guards skip the assignment when the following line matches the guard
    # pattern instead of being a value.
    case line
    # Basic Facilities
    when "Drinking Water Available"
      data['facilities']['basic']['water']['available'] = next_line if next_line && !next_line.match?(/Drinking Water Fun/)
    when "Drinking Water Functional"
      data['facilities']['basic']['water']['functional'] = next_line if next_line && !next_line.match?(/Rain/)
    when "Rain Water Harvesting"
      data['facilities']['basic']['water']['rain_water_harvesting'] = next_line if next_line && !next_line.match?(/Playground/)
    when "Playground Available"
      data['facilities']['basic']['safety']['playground'] = next_line if next_line && !next_line.match?(/Furniture/)
    when "Electricity Availability"
      data['facilities']['basic']['electricity']['available'] = next_line if next_line && !next_line.match?(/Solar/)
    when "Solar Panel"
      data['facilities']['basic']['electricity']['solar_panel'] = next_line if next_line && !next_line.match?(/Medical/)
    when "Furniture Availability"
      if next_line =~ /^\d+$/
        data['infrastructure']['furniture']['count'] = next_line.to_i
      end

    # Academic
    when /^Medium (\d)$/
      medium_num = $1
      # Value has the form "<code>-<name>".
      if next_line && next_line =~ /^(\d+)-(.+)$/
        code = $1
        name = $2.strip
        data['academic']['medium_of_instruction']["medium_#{medium_num}"] = {
          'code' => code,
          'name' => name
        }
      end

    when "CCE"
      # Up to four consecutive lines, one per school level.
      if next_line
        data['academic']['assessments']['cce']['implemented']['primary'] = next_line
        data['academic']['assessments']['cce']['implemented']['upper_primary'] = lines[i + 2] if lines[i + 2]
        data['academic']['assessments']['cce']['implemented']['secondary'] = lines[i + 3] if lines[i + 3]
        data['academic']['assessments']['cce']['implemented']['higher_secondary'] = lines[i + 4] if lines[i + 4]
      end

    # Residential Info
    when "Residential School"
      if next_line && next_line =~ /^(\d+)\s*-\s*(.+)$/
        code = $1
        type = $2.strip
        data['facilities']['residential']['details']['type'] = "#{code} - #{type}"
      end
    when "Residential Type"
      data['facilities']['residential']['details']['category'] = next_line if next_line && !next_line.match?(/Minority/)
    when "Minority School"
      data['facilities']['residential']['details']['minority_school'] = next_line if next_line && !next_line.match?(/Approachable/)
    when "Approachable By All Weather Road"
      data['facilities']['basic']['safety']['all_weather_road'] = next_line if next_line && !next_line.match?(/Toilets/)

    # Student Facilities
    when /No\.of Students Received/
      # Table header: nothing to record. This branch is kept so that such a
      # line is not handled by one of the patterns below. (Reassigning the
      # block-local index here would not skip any lines, so no attempt is
      # made to do so.)
      nil
    when /Free text books/
      if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
        data['students']['facilities']['incentives']['free_textbooks']['primary'] = lines[i + 1].to_i
        data['students']['facilities']['incentives']['free_textbooks']['upper_primary'] = lines[i + 2].to_i
      end
    when /Transport/
      if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
        data['students']['facilities']['general']['transport']['primary'] = lines[i + 1].to_i
        data['students']['facilities']['general']['transport']['upper_primary'] = lines[i + 2].to_i
      end
    when /Free uniform/
      if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
        data['students']['facilities']['incentives']['free_uniform']['primary'] = lines[i + 1].to_i
        data['students']['facilities']['incentives']['free_uniform']['upper_primary'] = lines[i + 2].to_i
      end

    # Committees
    when "SMC Exists"
      data['committees']['smc']['details']['exists'] = next_line if next_line && !next_line.match?(/SMC & SMDC/)
    when "SMC & SMDC Same"
      data['committees']['smc']['details']['same_as_smdc'] = next_line if next_line && !next_line.match?(/SMDC Con/)
    when "SMDC Constituted"
      data['committees']['smdc']['details']['constituted'] = next_line if next_line && !next_line.match?(/Text Books/)

    # Grants
    when "Grants Receipt"
      if next_line =~ /^\d+\.?\d*$/
        data['grants']['received']['amount'] = next_line.to_f
      end
    when "Grants Expenditure"
      if next_line =~ /^\d+\.?\d*$/
        data['grants']['expenditure']['amount'] = next_line.to_f
      end

    # Medical facilities
    when "Medical checkups"
      data['facilities']['medical']['checkups']['available'] = next_line if next_line

    # Sports facilities
    when "Sports Equipment"
      data['academic']['sports']['equipment']['available'] = next_line if next_line
    when "Physical Education Teacher"
      data['academic']['sports']['instructors']['available'] = next_line if next_line

    # Safety measures
    when "Fire Extinguisher"
      data['facilities']['safety']['fire']['equipment']['extinguisher'] = next_line if next_line
    when "Emergency Exit"
      data['facilities']['safety']['emergency']['exits']['available'] = next_line if next_line
    when "Security Guard"
      data['facilities']['safety']['security']['personnel']['guard'] = next_line if next_line

    # Committee meetings
    when "SMC Meetings Conducted"
      data['committees']['smc']['details']['meetings']['count'] = next_line.to_i if next_line =~ /^\d+$/
    when "SMDC Meetings Conducted"
      data['committees']['smdc']['details']['meetings']['count'] = next_line.to_i if next_line =~ /^\d+$/

    # Vocational courses
    when "Vocational Courses"
      data['academic']['vocational']['courses']['available'] = next_line if next_line
    when "Vocational Trainer"
      data['academic']['vocational']['trainers']['available'] = next_line if next_line
    end
  end

  # Clean up empty sections: first inside each top-level section, then the
  # top level itself. Deeper levels are left untouched.
  data.each do |_, section|
    if section.is_a?(Hash)
      section.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }
    end
  end
  data.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }

  data
end
294
+ end
295
+ end