udise_school_report_reader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +20 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +45 -0
  5. data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
  6. data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
  7. data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
  8. data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
  9. data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
  10. data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
  11. data/lib/udise_school_report_reader/csv_writer.rb +75 -0
  12. data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
  13. data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
  14. data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
  15. data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
  16. data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
  17. data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
  18. data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
  19. data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
  20. data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
  21. data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
  22. data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
  23. data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
  24. data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
  25. data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
  26. data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
  27. data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
  28. data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
  29. data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
  30. data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
  31. data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
  32. data/lib/udise_school_report_reader/version.rb +3 -0
  33. data/lib/udise_school_report_reader.rb +41 -0
  34. data/test/school_report_parser_test.rb +62 -0
  35. metadata +165 -0
@@ -0,0 +1,53 @@
1
# Scans the raw content stream of every page of a PDF reader for `re`
# (rectangle) operators and records each rectangle together with the most
# recently seen stroke colour, fill colour and line width.
class PDFRectangleExtractor
  # @param reader [#pages] object whose pages each respond to `raw_content`
  # @return [Array<Hash>] one hash per rectangle with the keys :page, :x, :y,
  #   :width, :height, :stroke_color, :fill_color and :line_width
  # @raise [ArgumentError] when reader is nil
  def self.extract_rectangles(reader)
    raise ArgumentError, "PDF reader cannot be nil" if reader.nil?

    # Graphics state carried from line to line. It is deliberately NOT reset
    # between pages, so a page inherits whatever the previous page set last.
    stroke = '0 G'       # default stroke colour (black)
    fill = '1 1 1 rg'    # default fill colour (white)
    pen_width = 1.0      # default line width
    found = []

    reader.pages.each.with_index(1) do |page, page_number|
      page.raw_content.each_line do |raw|
        # Colour operators: the whole stripped line is stored, not just the
        # operands. The patterns are unanchored and case-sensitive.
        stroke = raw.strip if raw.match?(/[\d.]+ [\d.]+ [\d.]+ RG/) || raw.match?(/[\d.]+ G/)
        fill = raw.strip if raw.match?(/[\d.]+ [\d.]+ [\d.]+ rg/) || raw.match?(/[\d.]+ g/)

        # Line width operator (`<number> w`).
        width_op = raw.match(/(\d+\.?\d*)\s+w/)
        pen_width = width_op[1].to_f if width_op

        # Rectangle operator (`x y width height re`). Only operands that start
        # with a digit are recognised, so signed values and values written
        # like ".5" do not match. At most one rectangle is taken per line.
        rect_op = raw.match(/(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+re/)
        next unless rect_op

        left, bottom, rect_width, rect_height = rect_op.captures.map(&:to_f)
        found << {
          page: page_number,
          x: left,
          y: bottom,
          width: rect_width,
          height: rect_height,
          stroke_color: stroke,
          fill_color: fill,
          line_width: pen_width
        }
      end
    end

    found
  end
end
@@ -0,0 +1,36 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Declarative field table for the rooms section of a report. The reading
# logic itself comes from DataReaderBase; this class only supplies the
# mapping from a label line to where its value is stored.
class RoomsDataReader
  include DataReaderBase

  # Label text => extraction settings:
  #   key_path    - nested keys under which the value is stored
  #   value_type  - :integer where given; entries without it leave the type
  #                 to DataReaderBase's default handling
  #   end_pattern - pattern associated with the label that follows this field
  #                 (presumably a stop marker -- confirm in DataReaderBase)
  FIELD_MAPPINGS = {
    'In Good Condition' => {
      key_path: %w[rooms classrooms good_condition], value_type: :integer, end_pattern: /Needs Minor/
    },
    'Needs Minor Repair' => {
      key_path: %w[rooms classrooms needs_minor_repair], value_type: :integer, end_pattern: /Needs Major/
    },
    'Needs Major Repair' => {
      key_path: %w[rooms classrooms needs_major_repair], value_type: :integer, end_pattern: /Other Rooms/
    },
    'Other Rooms' => {
      key_path: %w[rooms other], value_type: :integer, end_pattern: /Library/
    },
    'Library Availability' => {
      key_path: %w[rooms library], end_pattern: /Solar/
    },
    'Separate Room for HM' => {
      key_path: %w[rooms hm], end_pattern: /Drinking/
    }
  }
end
@@ -0,0 +1,118 @@
1
require 'csv'

# Reads the "students enrolled under Section 12 of the RTE Act" table from the
# combined block/rectangle CSV (columns used here: page, text, text_x, rect_y).
#
# Only page 1 is considered. Cells are grouped into rows by their exact rect_y
# value; the three rows directly following the title row are taken as the
# grade header row, the B/G header row and the value row.
class RteDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII'
  ]

  # Text that identifies the title cell of the RTE table.
  TITLE_TEXT = 'Total no. of Students Enrolled Under Section 12 of the RTE Act ' \
               'In Private Unaided and Specified Category Schools'

  # Convenience wrapper around `new(csv_path).read`.
  #
  # @param csv_path [String] path of the combined CSV
  # @return [Hash, nil] see #read
  def self.read(csv_path) = new(csv_path).read

  # Parses the CSV and locates the table rows. When the title or the three
  # rows after it cannot be found, the row variables stay nil and #read
  # returns nil.
  #
  # @param csv_path [String] path of the combined CSV
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    # Group page-1 cells by their rect_y coordinate.
    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end

    @title_row = @rows.find do |_, cells|
      cells.any? { |cell| cell['text']&.include?(TITLE_TEXT) }
    end

    title_y = @title_row&.first
    return unless title_y

    # Rows whose rect_y is smaller than the title's, largest rect_y first.
    rows_after_title = @rows.select { |y, _| y < title_y }
                            .sort_by(&:first)
                            .reverse
    return unless rows_after_title.size >= 3

    @grades_row, @bg_row, @values_row = rows_after_title.first(3).map(&:last)

    # Header rows are ordered left to right by text position.
    [@grades_row, @bg_row].each do |row|
      row.sort_by! { |cell| cell['text_x'].to_f }
    end

    @values_row = align_values_to_headers(@values_row, @bg_row)
  end

  # @return [Hash, nil] nil when the table was not found, otherwise
  #   grade_rows:  the grade header cells, sorted by text_x
  #   bg_pairs:    { midpoint_x => [b_cell, g_cell] }
  #   rte_numbers: { midpoint_x => [boys_value_cell, girls_value_cell] }
  def read
    return nil unless @grades_row && @bg_row && @values_row

    # Group B,G pairs, skipping a trailing incomplete pair.
    bg_pairs = {}
    @bg_row.each_slice(2) do |pair|
      next unless pair.size == 2 && pair[0] && pair[1]

      b, g = pair
      x_mid = (b['text_x'].to_f + g['text_x'].to_f) / 2
      bg_pairs[x_mid] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      rte_numbers: match_numbers_to_pairs(@values_row, bg_pairs),
    }
  end

  private

  # Builds a value row holding exactly one entry per B and G header cell, in
  # header order. A header without a value cell within `threshold` of its
  # text_x gets a '-' placeholder hash, and blank texts are normalised to '-'.
  #
  # An incomplete trailing header pair is skipped (as #read does) rather than
  # raising on nil, and a nil text (empty CSV field) is treated as blank.
  def align_values_to_headers(values_row, bg_row, threshold = 10.0)
    aligned = []

    bg_row.each_slice(2) do |pair|
      next unless pair.size == 2

      pair.each do |header|
        header_x = header['text_x'].to_f
        value = values_row.find { |cell| (cell['text_x'].to_f - header_x).abs < threshold }
        aligned << (value || { 'text' => '-', 'text_x' => header_x })
      end
    end

    aligned.each { |cell| cell['text'] = '-' if cell['text'].to_s.strip.empty? }
  end

  # Assigns to every B/G pair the first remaining value cell lying within
  # `threshold` of the B and of the G header position. A value is removed
  # from the candidate list once it has been assigned.
  #
  # @return [Hash] { midpoint_x => [boys_cell_or_nil, girls_cell_or_nil] }
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      next unless bg_pair && bg_pair.size == 2

      b_x = bg_pair[0]['text_x'].to_f
      g_x = bg_pair[1]['text_x'].to_f

      b_num = remaining.find { |cell| (cell['text_x'].to_f - b_x).abs < threshold }
      remaining.delete(b_num) if b_num

      g_num = remaining.find { |cell| (cell['text_x'].to_f - g_x).abs < threshold }
      remaining.delete(g_num) if g_num

      numbers[x_mid] = [b_num, g_num]
    end

    numbers
  end
end
@@ -0,0 +1,63 @@
1
require 'cgi/escape'

# Renders the RTE table produced by RteDataReader#read as a standalone HTML
# page: one column pair (B/G) per grade and a single "RTE" value row.
class RteHtmlWriter
  # Writes the HTML file. Does nothing and returns nil when data is nil.
  #
  # The values are read from data[:rte_numbers], which is the key
  # RteDataReader#read emits. data[:ews_numbers] is still accepted as a
  # fallback because earlier versions of this writer read that key.
  #
  # @param data [Hash, nil] hash with :grade_rows, :bg_pairs and :rte_numbers
  # @param html_path [String] destination file
  # @return [Integer, nil] result of File.write, or nil when data is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = [
      ['RTE', data[:rte_numbers] || data[:ews_numbers] || {}],
    ]

    # Cell text originates from the PDF, so it is escaped before being
    # embedded in markup. A missing cell renders as an empty <td>.
    cell_text = ->(cell) { cell ? CGI.escapeHTML(cell['text'].to_s) : '' }

    # One table row per category.
    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # Keys may be floats or their string form.
        nums = numbers[x_mid.to_s] || numbers[x_mid] || []
        "<td>#{cell_text.call(nums.first)}</td><td>#{cell_text.call(nums.last)}</td>"
      end.join

      "    <tr>\n" \
      "      <td class=\"category\">#{category}</td>\n" \
      "      #{cells}\n" \
      "    </tr>"
    end.join("\n")

    # Header rows: grade names spanning two columns, then B/G beneath each.
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{CGI.escapeHTML(row['text'].to_s)}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
        <title>RTE Enrollment Table</title>
        <style>
          table { border-collapse: collapse; margin-top: 20px; width: 100%; }
          th, td { border: 1px solid black; padding: 8px; text-align: center; }
          .header { font-weight: bold; background-color: #f0f0f0; }
          .grade { font-weight: bold; background-color: #e0e0e0; }
          .bg-pair { background-color: #f8f8f8; }
          .category { font-weight: bold; text-align: left; }
        </style>
      </head>
      <body>
        <h2>Students Enrolled Under Section 12 of the RTE Act</h2>
        <table>
          <tr class="grade">
            <th rowspan="2">Category</th>
            #{grade_headers}
          </tr>
          <tr class="bg-pair">
            #{bg_headers}
          </tr>
      #{table_rows}
        </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end
end
@@ -0,0 +1,61 @@
1
# Converts the hash returned by RteDataReader#read into a plain nested hash
# of boys/girls counts per grade, suitable for YAML serialisation.
class RteYamlWriter
  # Grade label as it appears in the report => key used in the output.
  # Labels that are not listed are lower-cased with whitespace runs replaced
  # by underscores.
  GRADE_MAPPING = {
    'Pre-Pri.' => 'pre-pri.',
    'Class I' => 'class_i',
    'Class II' => 'class_ii',
    'Class III' => 'class_iii',
    'Class IV' => 'class_iv',
    'Class V' => 'class_v',
    'Class VI' => 'class_vi',
    'Class VII' => 'class_vii',
    'Class VIII' => 'class_viii',
    'Class IX' => 'class_ix',
    'Class X' => 'class_x',
    'Class XI' => 'class_xi',
    'Class XII' => 'class_xii'
  }

  # @param data [Hash, nil] hash with :grade_rows (cells exposing 'text',
  #   'rect_x' and 'rect_width') and :rte_numbers ({ x_mid => [boys, girls] })
  # @return [Hash, nil] { 'rte' => { grade_key => { 'boys' => n, 'girls' => n } } },
  #   or nil when data is nil
  def self.format_yaml(data)
    return unless data

    grade_cells = data[:grade_rows]

    # Output key for each grade cell, kept in the same order as grade_cells.
    grade_keys = grade_cells.map { |cell| cell['text'] }.map do |label|
      GRADE_MAPPING[label] || label.downcase.gsub(/\s+/, '_')
    end

    # Every grade starts at zero so grades without values still appear.
    counts = grade_keys.each_with_object({}) do |key, acc|
      acc[key] = { 'boys' => 0, 'girls' => 0 }
    end

    data[:rte_numbers].each do |x_mid, pair|
      next unless pair && pair.size == 2

      # A pair belongs to the first grade cell whose horizontal extent
      # [rect_x, rect_x + rect_width] contains the pair's midpoint.
      position = grade_cells.each_index.find do |idx|
        left_edge = grade_cells[idx]['rect_x'].to_f
        right_edge = left_edge + grade_cells[idx]['rect_width'].to_f
        x_mid >= left_edge && x_mid <= right_edge
      end
      next unless position && grade_keys[position]

      boys_text, girls_text = pair.map { |cell| cell&.dig('text') }
      entry = counts[grade_keys[position]]

      # '-' marks "no value"; a missing cell yields nil, and nil.to_i is 0.
      entry['boys'] = boys_text == '-' ? 0 : boys_text.to_i
      entry['girls'] = girls_text == '-' ? 0 : girls_text.to_i
    end

    { 'rte' => counts }
  end
end
@@ -0,0 +1,56 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Declarative field table for the sanitation section of a report. The reading
# logic itself comes from DataReaderBase; this class only supplies the
# mapping from a label line to where its value is stored.
class SanitationDataReader
  include DataReaderBase

  # Builds one toilet-table section: the given trigger, an offset of 1 and
  # two integer fields keyed ['boys', measure] and ['girls', measure].
  toilet_section = lambda do |trigger, measure|
    {
      trigger: trigger,
      offset: 1,
      fields: %w[boys girls].map { |group| { key: [group, measure], value_type: :integer } }
    }
  end

  # Label text => extraction settings. 'Toilets' is flagged as a table and
  # described by four sections (total, functional, CWSN-friendly, urinals),
  # each holding a boys and a girls count. How trigger and offset are applied
  # is defined by DataReaderBase.
  FIELD_MAPPINGS = {
    'Handwash Near Toilet' => {
      key_path: %w[sanitation handwash near_toilet],
      end_pattern: /Handwash Facility/
    },
    'Handwash Facility for Meal' => {
      key_path: %w[sanitation handwash for_meal],
      end_pattern: /Total Class/
    },
    'Toilets' => {
      key_path: %w[sanitation toilets],
      is_table: true,
      table_config: {
        sections: [
          toilet_section.call(/Total.*CWSN/, 'total'),
          toilet_section.call("Functional", 'functional'),
          toilet_section.call(/CWSN Friendly/, 'cwsn'),
          toilet_section.call("Urinal", 'urinals')
        ]
      }
    }
  }
end
@@ -0,0 +1,295 @@
1
+ require 'pdf-reader'
2
+ require 'yaml'
3
+ require 'csv'
4
+ require 'fileutils'
5
+ require 'tempfile'
6
+
7
module UdiseSchoolReportReader
  # Entry point of the gem: extracts the data of a UDISE school report PDF
  # into a nested hash and optionally writes intermediate and final files.
  #
  # All methods are public class methods. The original source had a bare
  # `private` in front of the helpers, but `private` does not apply to
  # `def self.` methods, so they were always publicly callable; that
  # visibility is kept unchanged here.
  class SchoolReportParser
    # Extracts the report data from a PDF.
    #
    # @param pdf_path [String] path of the report PDF
    # @param output_dir [String, nil] directory for generated files; see
    #   OutputPaths for the locations used when nil
    # @param write_files [Boolean] also write the text/CSV/HTML/YAML files
    # @return [Hash] the extracted data (the :yaml_data part of extract_data)
    # @raise [ArgumentError] when pdf_path does not exist
    def self.extract_to_text(pdf_path, output_dir = nil, write_files = false)
      raise ArgumentError, "PDF file not found" unless File.exist?(pdf_path)

      extracted_data = extract_data(pdf_path)
      write_output_files(pdf_path, extracted_data, output_dir) if write_files
      extracted_data[:yaml_data]
    end

    # Runs every extraction stage and returns all intermediate results.
    #
    # @param pdf_path [String] path of the report PDF
    # @return [Hash] with the keys :content, :compressed_content, :blocks,
    #   :rectangles, :combined_data, :enrollment_data, :ews_data, :rte_data
    #   and :yaml_data
    def self.extract_data(pdf_path)
      reader = PDF::Reader.new(pdf_path)

      # Raw content streams of all pages, and their compressed form.
      content = reader.pages.map(&:raw_content).join("\n")
      compressed_content = PDFContentCompressor.compress(content)

      # Text blocks and rectangles, joined into combined cell data.
      blocks = PDFBlockExtractor.extract_blocks(reader)
      rectangles = PDFRectangleExtractor.extract_rectangles(reader)
      combined_data = BlockRectangleCombiner.combine(blocks, rectangles)

      yaml_data = extract_data_points(compressed_content)

      # The table readers take a CSV path, so the combined data goes through
      # a temporary file that is removed again in the ensure clause.
      temp_file = Tempfile.new(['combined', '.csv'])
      begin
        CSVWriter.write_combined(combined_data, temp_file.path)

        enrollment_data = EnrollmentDataReader.read(temp_file.path)
        ews_data = EwsDataReader.read(temp_file.path)
        rte_data = RteDataReader.read(temp_file.path)

        # Enrollment is only added when present; the EWS and RTE keys are
        # always set, to whatever their writer returns.
        yaml_data['enrollment_data'] = EnrollmentYamlWriter.format_yaml(enrollment_data) if enrollment_data
        yaml_data['ews_data'] = EwsYamlWriter.format_yaml(ews_data)
        yaml_data['rte_data'] = RteYamlWriter.format_yaml(rte_data)

        {
          content: content,
          compressed_content: compressed_content,
          blocks: blocks,
          rectangles: rectangles,
          combined_data: combined_data,
          enrollment_data: enrollment_data,
          ews_data: ews_data,
          rte_data: rte_data,
          yaml_data: yaml_data
        }
      ensure
        temp_file.close! # close and unlink
      end
    end

    # Writes the text, CSV, HTML and YAML files for an extraction result.
    #
    # @param pdf_path [String] path of the source PDF (used for file naming)
    # @param data [Hash] result of extract_data
    # @param output_dir [String, nil] target directory; created when missing
    def self.write_output_files(pdf_path, data, output_dir)
      # The default tmp directory is created by OutputPaths; an explicitly
      # given directory is created here so File.write cannot fail on it.
      FileUtils.mkdir_p(output_dir) if output_dir
      paths = OutputPaths.new(pdf_path, output_dir)

      # Text files
      File.write(paths.txt, data[:content])
      File.write(paths.compressed_txt, data[:compressed_content])

      # CSV files
      CSVWriter.write_blocks(data[:blocks], paths.blocks_csv)
      CSVWriter.write_rectangles(data[:rectangles], paths.rects_csv)
      CSVWriter.write_combined(data[:combined_data], paths.combined_csv)

      # HTML files
      RteHtmlWriter.generate_html(data[:rte_data], paths.rte_html)
      EnrollmentHtmlWriter.generate_html(data[:enrollment_data], paths.enrollment_html)
      EwsHtmlWriter.generate_html(data[:ews_data], paths.ews_html)

      # YAML file
      File.write(paths.yaml, data[:yaml_data].to_yaml)
    end

    # Computes the path of every generated file from the PDF path.
    #
    # With an output directory all files go there. Without one, the YAML file
    # is placed next to the PDF and everything else in ./tmp (relative to the
    # current working directory), which is created on demand.
    class OutputPaths
      # Accessor name => suffix appended to the PDF's base name.
      EXTENSIONS = {
        txt: '.txt',
        compressed_txt: '_compressed.txt',
        blocks_csv: '_blocks.csv',
        rects_csv: '_rects.csv',
        combined_csv: '_combined.csv',
        rte_html: '_rte.html',
        enrollment_html: '_enrollment.html',
        ews_html: '_ews.html',
        yaml: '.yml'
      }

      # @param pdf_path [String] path of the source PDF
      # @param output_dir [String, nil] target directory, or nil for defaults
      def initialize(pdf_path, output_dir)
        @pdf_path = pdf_path
        @output_dir = output_dir
        @base_name = File.basename(pdf_path, '.pdf')
      end

      # One reader method per EXTENSIONS entry (txt, blocks_csv, yaml, ...).
      EXTENSIONS.each do |name, ext|
        define_method(name) do
          if @output_dir
            File.join(@output_dir, "#{@base_name}#{ext}")
          elsif name == :yaml
            File.join(File.dirname(@pdf_path), "#{@base_name}#{ext}")
          else
            tmp_dir = File.join(File.expand_path('.'), 'tmp')
            FileUtils.mkdir_p(tmp_dir)
            File.join(tmp_dir, "#{@base_name}#{ext}")
          end
        end
      end
    end

    # Builds the data hash from the compressed content: starts from the
    # template, merges the output of the section readers and then fills the
    # remaining fields from "label line followed by value line" pairs.
    #
    # @param compressed_content [String] output of PDFContentCompressor
    # @return [Hash] extracted data with nil and empty sections removed
    def self.extract_data_points(compressed_content)
      # One entry per line, trimmed and with backslashes removed.
      lines = compressed_content.split("\n").map { |line| line.strip.gsub(/\\/, '') }

      # The template provides the nested structure that is written into below.
      # NOTE(review): the path is relative to the current working directory,
      # not to this file -- confirm this is intended for an installed gem.
      data = YAML.load_file('template.yml')

      # Section readers
      basic_info_data = BasicInfoDataReader.read(lines)
      location_data = LocationDataReader.read(lines)
      official_data = OfficialDataReader.read(lines)
      characteristics_data = CharacteristicsReader.read(lines)
      digital_facilities_data = DigitalFacilitiesDataReader.read(lines)
      anganwadi_data = AnganwadiDataReader.read(lines)
      building_data = BuildingDataReader.read(lines)
      rooms_data = RoomsDataReader.read(lines)
      teacher_data = TeacherDataReader.read(lines)
      sanitation_data = SanitationDataReader.read(lines)
      activities_data = ActivitiesDataReader.read(lines)

      # merge! is shallow: a top-level key returned by a reader replaces the
      # template's value for that key, and later readers win over earlier ones.
      data.merge!(basic_info_data) if basic_info_data
      data.merge!(location_data) if location_data
      data.merge!(official_data) if official_data
      data.merge!(characteristics_data) if characteristics_data
      data.merge!(digital_facilities_data) if digital_facilities_data
      data.merge!(anganwadi_data) if anganwadi_data
      data.merge!(building_data) if building_data
      data.merge!(rooms_data) if rooms_data
      data.merge!(teacher_data) if teacher_data
      data.merge!(activities_data) if activities_data
      data.merge!(sanitation_data) if sanitation_data

      lines.each_with_index do |line, i|
        next_line = lines[i + 1]&.strip

        # Most branches store the following line as the value unless that
        # line is already the next label (i.e. the value is missing).
        case line
        # Basic Facilities
        when "Drinking Water Available"
          data['facilities']['basic']['water']['available'] = next_line if next_line && !next_line.match?(/Drinking Water Fun/)
        when "Drinking Water Functional"
          data['facilities']['basic']['water']['functional'] = next_line if next_line && !next_line.match?(/Rain/)
        when "Rain Water Harvesting"
          data['facilities']['basic']['water']['rain_water_harvesting'] = next_line if next_line && !next_line.match?(/Playground/)
        when "Playground Available"
          data['facilities']['basic']['safety']['playground'] = next_line if next_line && !next_line.match?(/Furniture/)
        when "Electricity Availability"
          data['facilities']['basic']['electricity']['available'] = next_line if next_line && !next_line.match?(/Solar/)
        when "Solar Panel"
          data['facilities']['basic']['electricity']['solar_panel'] = next_line if next_line && !next_line.match?(/Medical/)
        when "Furniture Availability"
          data['infrastructure']['furniture']['count'] = next_line.to_i if next_line =~ /^\d+$/

        # Academic
        when /^Medium (\d)$/
          medium_num = Regexp.last_match(1)
          # The value line has the form "<code>-<name>".
          if (medium = next_line&.match(/^(\d+)-(.+)$/))
            data['academic']['medium_of_instruction']["medium_#{medium_num}"] = {
              'code' => medium[1],
              'name' => medium[2].strip
            }
          end

        when "CCE"
          # Up to four consecutive value lines, one per school level.
          if next_line
            data['academic']['assessments']['cce']['implemented']['primary'] = next_line
            data['academic']['assessments']['cce']['implemented']['upper_primary'] = lines[i + 2] if lines[i + 2]
            data['academic']['assessments']['cce']['implemented']['secondary'] = lines[i + 3] if lines[i + 3]
            data['academic']['assessments']['cce']['implemented']['higher_secondary'] = lines[i + 4] if lines[i + 4]
          end

        # Residential Info
        when "Residential School"
          # The value line has the form "<code> - <type>".
          if (residential = next_line&.match(/^(\d+)\s*-\s*(.+)$/))
            data['facilities']['residential']['details']['type'] = "#{residential[1]} - #{residential[2].strip}"
          end
        when "Residential Type"
          data['facilities']['residential']['details']['category'] = next_line if next_line && !next_line.match?(/Minority/)
        when "Minority School"
          data['facilities']['residential']['details']['minority_school'] = next_line if next_line && !next_line.match?(/Approachable/)
        when "Approachable By All Weather Road"
          data['facilities']['basic']['safety']['all_weather_road'] = next_line if next_line && !next_line.match?(/Toilets/)

        # Student Facilities
        when /No\.of Students Received/
          # Header line of the incentives table: nothing to store. The branch
          # is kept so that this line never reaches the patterns below. (The
          # former `i += 2` only rebound the block-local index and never
          # skipped any line.)
          nil
        when /Free text books/
          if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
            data['students']['facilities']['incentives']['free_textbooks']['primary'] = lines[i + 1].to_i
            data['students']['facilities']['incentives']['free_textbooks']['upper_primary'] = lines[i + 2].to_i
          end
        when /Transport/
          if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
            data['students']['facilities']['general']['transport']['primary'] = lines[i + 1].to_i
            data['students']['facilities']['general']['transport']['upper_primary'] = lines[i + 2].to_i
          end
        when /Free uniform/
          if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
            data['students']['facilities']['incentives']['free_uniform']['primary'] = lines[i + 1].to_i
            data['students']['facilities']['incentives']['free_uniform']['upper_primary'] = lines[i + 2].to_i
          end

        # Committees
        when "SMC Exists"
          data['committees']['smc']['details']['exists'] = next_line if next_line && !next_line.match?(/SMC & SMDC/)
        when "SMC & SMDC Same"
          data['committees']['smc']['details']['same_as_smdc'] = next_line if next_line && !next_line.match?(/SMDC Con/)
        when "SMDC Constituted"
          data['committees']['smdc']['details']['constituted'] = next_line if next_line && !next_line.match?(/Text Books/)

        # Grants
        when "Grants Receipt"
          data['grants']['received']['amount'] = next_line.to_f if next_line =~ /^\d+\.?\d*$/
        when "Grants Expenditure"
          data['grants']['expenditure']['amount'] = next_line.to_f if next_line =~ /^\d+\.?\d*$/

        # Medical facilities
        when "Medical checkups"
          data['facilities']['medical']['checkups']['available'] = next_line if next_line

        # Sports facilities
        when "Sports Equipment"
          data['academic']['sports']['equipment']['available'] = next_line if next_line
        when "Physical Education Teacher"
          data['academic']['sports']['instructors']['available'] = next_line if next_line

        # Safety measures
        when "Fire Extinguisher"
          data['facilities']['safety']['fire']['equipment']['extinguisher'] = next_line if next_line
        when "Emergency Exit"
          data['facilities']['safety']['emergency']['exits']['available'] = next_line if next_line
        when "Security Guard"
          data['facilities']['safety']['security']['personnel']['guard'] = next_line if next_line

        # Committee meetings
        when "SMC Meetings Conducted"
          data['committees']['smc']['details']['meetings']['count'] = next_line.to_i if next_line =~ /^\d+$/
        when "SMDC Meetings Conducted"
          data['committees']['smdc']['details']['meetings']['count'] = next_line.to_i if next_line =~ /^\d+$/

        # Vocational courses
        when "Vocational Courses"
          data['academic']['vocational']['courses']['available'] = next_line if next_line
        when "Vocational Trainer"
          data['academic']['vocational']['trainers']['available'] = next_line if next_line
        end
      end

      # Drop nil values and empty hashes, first one level inside each section
      # and then at the top level.
      data.each do |_, section|
        if section.is_a?(Hash)
          section.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }
        end
      end
      data.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }

      data
    end
  end
end