udise_school_report_reader 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +20 -0
- data/LICENSE.txt +21 -0
- data/README.md +45 -0
- data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
- data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
- data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
- data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
- data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
- data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
- data/lib/udise_school_report_reader/csv_writer.rb +75 -0
- data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
- data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
- data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
- data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
- data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
- data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
- data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
- data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
- data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
- data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
- data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
- data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
- data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
- data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
- data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
- data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
- data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
- data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
- data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
- data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
- data/lib/udise_school_report_reader/version.rb +3 -0
- data/lib/udise_school_report_reader.rb +41 -0
- data/test/school_report_parser_test.rb +62 -0
- metadata +165 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
# Collects the rectangles drawn by `re` operators in the raw content streams
# of a PDF, together with the stroke colour, fill colour and line width that
# were in effect when each rectangle was declared.
#
# This is a line-oriented scanner, not a full content-stream parser: it looks
# at one line at a time and does not track the `q`/`Q` graphics-state stack.
#
# NOTE(review): the numeric patterns accept only unsigned numbers with a
# leading digit, so operands such as "-15" or ".5" are not recognised. That is
# inherited from the original implementation and left unchanged because the
# downstream consumers of these rectangles are not visible from here.
class PDFRectangleExtractor
  # Values assumed until a page's content stream sets its own.
  DEFAULT_STROKE_COLOR = '0 G'      # black stroke
  DEFAULT_FILL_COLOR = '1 1 1 rg'   # white fill
  DEFAULT_LINE_WIDTH = 1.0

  # The trailing \b requires the operator to be a complete token. Without it,
  # text shown on the page (e.g. "(Class 5 General 3 girls 7 with) Tj") was
  # mistaken for a `G`, `g` or `w` operator and overwrote the tracked state.
  STROKE_COLOR_PATTERN = /[\d.]+ [\d.]+ [\d.]+ RG\b|[\d.]+ G\b/
  FILL_COLOR_PATTERN = /[\d.]+ [\d.]+ [\d.]+ rg\b|[\d.]+ g\b/
  LINE_WIDTH_PATTERN = /(\d+\.?\d*)\s+w\b/
  RECTANGLE_PATTERN = /(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+re\b/

  # @param reader [#pages] object whose pages respond to +raw_content+
  #   (a PDF::Reader in the calling code)
  # @return [Array<Hash>] one hash per rectangle with the keys :page (1-based),
  #   :x, :y, :width, :height (Floats), :stroke_color, :fill_color (the stripped
  #   source line that set the colour, or the default) and :line_width (Float)
  # @raise [ArgumentError] if +reader+ is nil
  def self.extract_rectangles(reader)
    raise ArgumentError, "PDF reader cannot be nil" if reader.nil?

    rectangles = []

    reader.pages.each_with_index do |page, index|
      # The graphics state starts from its defaults on every page, so state set
      # on an earlier page must not leak into this one.
      stroke_color = DEFAULT_STROKE_COLOR
      fill_color = DEFAULT_FILL_COLOR
      line_width = DEFAULT_LINE_WIDTH

      page.raw_content.each_line do |line|
        stroke_color = line.strip if line.match?(STROKE_COLOR_PATTERN)
        fill_color = line.strip if line.match?(FILL_COLOR_PATTERN)

        if (width_match = line.match(LINE_WIDTH_PATTERN))
          line_width = width_match[1].to_f
        end

        rect_match = line.match(RECTANGLE_PATTERN)
        next unless rect_match

        x, y, width, height = rect_match.captures.map(&:to_f)
        rectangles << {
          page: index + 1,
          x: x,
          y: y,
          width: width,
          height: height,
          stroke_color: stroke_color,
          fill_color: fill_color,
          line_width: line_width
        }
      end
    end

    rectangles
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require_relative 'data_reader_base'
|
2
|
+
|
3
|
+
# Declares which labels of the report text feed the 'rooms' section.
#
# Each entry maps a label to the nested key path its value is stored under,
# an optional value type, and a pattern naming the label that follows it.
# How these options are interpreted is defined by DataReaderBase (not shown
# here) -- presumably the pattern marks where the value ends; confirm there.
class RoomsDataReader
  include DataReaderBase

  FIELD_MAPPINGS = {
    # Classroom counts by condition
    'In Good Condition' => {
      key_path: %w[rooms classrooms good_condition],
      value_type: :integer,
      end_pattern: /Needs Minor/
    },
    'Needs Minor Repair' => {
      key_path: %w[rooms classrooms needs_minor_repair],
      value_type: :integer,
      end_pattern: /Needs Major/
    },
    'Needs Major Repair' => {
      key_path: %w[rooms classrooms needs_major_repair],
      value_type: :integer,
      end_pattern: /Other Rooms/
    },

    # Remaining room information
    'Other Rooms' => {
      key_path: %w[rooms other],
      value_type: :integer,
      end_pattern: /Library/
    },
    'Library Availability' => {
      key_path: %w[rooms library],
      end_pattern: /Solar/
    },
    'Separate Room for HM' => {
      key_path: %w[rooms hm],
      end_pattern: /Drinking/
    }
  }
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# Reads the RTE (Section 12) enrolment table out of the combined
# block/rectangle CSV.
#
# Only cells of page 1 are considered. Cells are grouped into rows by their
# +rect_y+ value. The three rows directly below the title row (lower +rect_y+)
# are taken, in order, as the grade header row, the B/G header row and the
# values row.
#
# The class relies on the CSV library being loaded by the caller.
class RteDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII'
  ]

  # Text that identifies the title cell of the table.
  TITLE_TEXT = 'Total no. of Students Enrolled Under Section 12 of the RTE Act In Private Unaided and Specified Category Schools'

  # Maximum horizontal distance (in text_x units) between a B/G header and the
  # value cell that belongs to it.
  X_TOLERANCE = 10.0

  # @param csv_path [String] path of the combined CSV (needs the columns
  #   page, text, text_x and rect_y)
  # @return [Hash, nil] see {#read}
  def self.read(csv_path) = new(csv_path).read

  # Loads the CSV and locates the three table rows. When the title or the rows
  # below it cannot be found, the instance is left without rows and {#read}
  # returns nil.
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    # Group page-1 cells by rect_y
    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end

    @title_row = @rows.find { |_, cells| cells.any? { |cell| cell['text']&.include?(TITLE_TEXT) } }
    title_y = @title_row&.first
    return unless title_y

    # Rows below the title, nearest first
    rows_after_title = @rows.select { |y, _| y < title_y }.sort_by(&:first).reverse
    return unless rows_after_title.size >= 3

    @grades_row, @bg_row, @values_row = rows_after_title.first(3).map(&:last)

    [@grades_row, @bg_row].each { |row| row.sort_by! { |cell| cell['text_x'].to_f } }

    @values_row = align_values_with_headers(@values_row, @bg_row)

    # An empty CSV field is parsed as nil, hence to_s before strip.
    @values_row.each { |cell| cell['text'] = '-' if cell['text'].to_s.strip.empty? }
  end

  # @return [Hash, nil] nil when the table was not found; otherwise
  #   :grade_rows  => grade header cells sorted by text_x,
  #   :bg_pairs    => { x_mid => [boys_header, girls_header] },
  #   :rte_numbers => { x_mid => [boys_value, girls_value] }
  #   where x_mid is the midpoint of the two headers' text_x.
  def read
    return nil unless @grades_row && @bg_row && @values_row

    bg_pairs = {}
    @bg_row.each_slice(2) do |b, g|
      next unless b && g # Skip incomplete pairs

      bg_pairs[(b['text_x'].to_f + g['text_x'].to_f) / 2] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      rte_numbers: match_numbers_to_pairs(@values_row, bg_pairs)
    }
  end

  private

  # Returns one value cell per B/G header, in header order. A header without a
  # value cell within X_TOLERANCE gets a placeholder cell with the text "-".
  # A trailing header without a partner is ignored, as in #read (it used to
  # raise NoMethodError on nil).
  def align_values_with_headers(values_row, bg_row)
    bg_row.each_slice(2).flat_map do |b, g|
      next [] unless b && g

      [b, g].map do |header|
        x = header['text_x'].to_f
        values_row.find { |cell| (cell['text_x'].to_f - x).abs < X_TOLERANCE } ||
          { 'text' => '-', 'text_x' => x }
      end
    end
  end

  # Assigns to every B/G pair the first unused value cell lying within
  # +threshold+ of the B header and of the G header respectively.
  #
  # @return [Hash] { x_mid => [boys_cell_or_nil, girls_cell_or_nil] }
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    # Removes exactly the matched cell; Array#delete would also drop every
    # other cell that compares equal to it.
    take_near = lambda do |x|
      idx = remaining.index { |cell| (cell['text_x'].to_f - x).abs < threshold }
      idx && remaining.delete_at(idx)
    end

    bg_pairs.each do |x_mid, bg_pair|
      next unless bg_pair && bg_pair.size == 2 # Skip invalid pairs

      b_num = take_near.call(bg_pair[0]['text_x'].to_f)
      g_num = take_near.call(bg_pair[1]['text_x'].to_f)
      numbers[x_mid] = [b_num, g_num]
    end

    numbers
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Renders the RTE enrolment table produced by RteDataReader#read as a
# standalone HTML page.
class RteHtmlWriter
  # Writes the HTML page to +html_path+.
  #
  # @param data [Hash, nil] hash with :grade_rows, :bg_pairs and :rte_numbers;
  #   nothing is written when nil
  # @param html_path [String] destination file
  # @return [Integer, nil] the result of File.write, or nil when +data+ is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    # RteDataReader#read returns its values under :rte_numbers. The previous
    # code only looked at :ews_numbers, so the table body was always empty.
    # :ews_numbers is still accepted as a fallback for callers that relied
    # on the old key.
    categories = [
      ['RTE', data[:rte_numbers] || data[:ews_numbers] || {}]
    ]

    # One table row per category, one boys/girls cell pair per B/G header pair
    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        nums = numbers[x_mid.to_s] || numbers[x_mid] || []
        b_num = nums.first
        g_num = nums.last
        "<td>#{b_num ? b_num['text'] : ''}</td><td>#{g_num ? g_num['text'] : ''}</td>"
      end.join

      " <tr>\n" \
      " <td class=\"category\">#{category}</td>\n" \
      " #{cells}\n" \
      " </tr>"
    end.join("\n")

    # Header rows: grade names spanning two columns, then a B/G pair per grade
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{row['text']}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
      <title>RTE Enrollment Table</title>
      <style>
      table { border-collapse: collapse; margin-top: 20px; width: 100%; }
      th, td { border: 1px solid black; padding: 8px; text-align: center; }
      .header { font-weight: bold; background-color: #f0f0f0; }
      .grade { font-weight: bold; background-color: #e0e0e0; }
      .bg-pair { background-color: #f8f8f8; }
      .category { font-weight: bold; text-align: left; }
      </style>
      </head>
      <body>
      <h2>Students Enrolled Under Section 12 of the RTE Act</h2>
      <table>
      <tr class="grade">
      <th rowspan="2">Category</th>
      #{grade_headers}
      </tr>
      <tr class="bg-pair">
      #{bg_headers}
      </tr>
      #{table_rows}
      </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Converts the table returned by RteDataReader#read into the nested hash that
# is stored under 'rte_data' in the YAML output.
class RteYamlWriter
  # Grade label as printed in the report => key used in the output.
  GRADE_MAPPING = {
    'Pre-Pri.' => 'pre-pri.',
    'Class I' => 'class_i',
    'Class II' => 'class_ii',
    'Class III' => 'class_iii',
    'Class IV' => 'class_iv',
    'Class V' => 'class_v',
    'Class VI' => 'class_vi',
    'Class VII' => 'class_vii',
    'Class VIII' => 'class_viii',
    'Class IX' => 'class_ix',
    'Class X' => 'class_x',
    'Class XI' => 'class_xi',
    'Class XII' => 'class_xii'
  }.freeze

  # @param data [Hash, nil] hash with :grade_rows (cells carrying 'text',
  #   'rect_x' and 'rect_width') and :rte_numbers
  #   ({ x_mid => [boys_cell, girls_cell] })
  # @return [Hash, nil] { 'rte' => { grade_key => { 'boys' => Integer,
  #   'girls' => Integer } } }, or nil when +data+ is nil
  def self.format_yaml(data)
    return unless data

    grade_rows = data[:grade_rows]

    # Index of the grade cell => output key. Cells without text (an empty CSV
    # field is read as nil) are skipped; they used to raise on nil.downcase.
    # Labels are stripped so that padded text still hits GRADE_MAPPING.
    grade_keys = {}
    grade_rows.each_with_index do |row, idx|
      label = row['text'].to_s.strip
      next if label.empty?

      grade_keys[idx] = GRADE_MAPPING[label] || label.downcase.gsub(/\s+/, '_')
    end

    rte_data = { 'rte' => {} }
    grade_keys.each_value do |grade_key|
      rte_data['rte'][grade_key] = { 'boys' => 0, 'girls' => 0 }
    end

    data[:rte_numbers].each do |x_mid, pair|
      next unless pair && pair.size == 2

      # The pair belongs to the grade whose cell spans the pair's x midpoint
      grade_idx = grade_keys.keys.find do |idx|
        x_start = grade_rows[idx]['rect_x'].to_f
        x_end = x_start + grade_rows[idx]['rect_width'].to_f
        x_mid >= x_start && x_mid <= x_end
      end
      next unless grade_idx

      counts = rte_data['rte'][grade_keys[grade_idx]]
      # to_s.to_i maps both a missing cell (nil) and the "-" placeholder to 0
      counts['boys'] = pair[0]&.dig('text').to_s.to_i
      counts['girls'] = pair[1]&.dig('text').to_s.to_i
    end

    rte_data
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require_relative 'data_reader_base'
|
2
|
+
|
3
|
+
# Declares which labels of the report text feed the 'sanitation' section.
#
# Two kinds of entries are declared: plain label/value fields (key_path plus
# end_pattern) and one table ('Toilets') whose sections each name a trigger,
# an offset and a boys/girls pair of integer fields. How these options are
# interpreted is defined by DataReaderBase (not shown here).
class SanitationDataReader
  include DataReaderBase

  # Builds the boys/girls integer field pair for one toilet metric.
  boys_and_girls = lambda do |metric|
    %w[boys girls].map { |group| { key: [group, metric], value_type: :integer } }
  end

  FIELD_MAPPINGS = {
    'Handwash Near Toilet' => {
      key_path: %w[sanitation handwash near_toilet],
      end_pattern: /Handwash Facility/
    },
    'Handwash Facility for Meal' => {
      key_path: %w[sanitation handwash for_meal],
      end_pattern: /Total Class/
    },
    'Toilets' => {
      key_path: %w[sanitation toilets],
      is_table: true,
      table_config: {
        sections: [
          { trigger: /Total.*CWSN/, offset: 1, fields: boys_and_girls.call('total') },
          { trigger: "Functional", offset: 1, fields: boys_and_girls.call('functional') },
          { trigger: /CWSN Friendly/, offset: 1, fields: boys_and_girls.call('cwsn') },
          { trigger: "Urinal", offset: 1, fields: boys_and_girls.call('urinals') }
        ]
      }
    }
  }
end
|
@@ -0,0 +1,295 @@
|
|
1
|
+
require 'pdf-reader'
|
2
|
+
require 'yaml'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'tempfile'
|
6
|
+
|
7
|
+
module UdiseSchoolReportReader
|
8
|
+
class SchoolReportParser
|
9
|
+
# Parses a school report PDF and returns the extracted data.
#
# @param pdf_path [String] path of the PDF to read
# @param output_dir [String, nil] directory for the output files; only used
#   when +write_files+ is true
# @param write_files [Boolean] when true, the intermediate and final files are
#   written via write_output_files
# @return [Hash] the :yaml_data entry of the extraction result
# @raise [ArgumentError] if +pdf_path+ does not name an existing regular file
def self.extract_to_text(pdf_path, output_dir = nil, write_files = false)
  # File.file? also rejects directories, which File.exist? let through and
  # which then failed later while the PDF was being opened.
  raise ArgumentError, "PDF file not found" unless File.file?(pdf_path)

  # Extract all data first
  extracted_data = extract_data(pdf_path)

  # Write files if requested
  write_output_files(pdf_path, extracted_data, output_dir) if write_files

  # Return the YAML data
  extracted_data[:yaml_data]
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# Runs the whole extraction pipeline for one PDF.
#
# The combined block/rectangle data is written to a temporary CSV because the
# table readers (enrollment, EWS, RTE) take a CSV path as input. The temporary
# file is closed and removed even if a reader raises.
#
# @param pdf_path [String] path of the PDF to read
# @return [Hash] with the keys :content, :compressed_content, :blocks,
#   :rectangles, :combined_data, :enrollment_data, :ews_data, :rte_data and
#   :yaml_data
def self.extract_data(pdf_path)
  pdf = PDF::Reader.new(pdf_path)

  # Raw page content, plain and compressed
  raw_text = pdf.pages.map(&:raw_content).join("\n")
  squeezed_text = PDFContentCompressor.compress(raw_text)

  # Text blocks and rectangles, then both combined
  text_blocks = PDFBlockExtractor.extract_blocks(pdf)
  rects = PDFRectangleExtractor.extract_rectangles(pdf)
  merged = BlockRectangleCombiner.combine(text_blocks, rects)

  # Label/value data points
  report = extract_data_points(squeezed_text)

  scratch = Tempfile.new(['combined', '.csv'])
  begin
    CSVWriter.write_combined(merged, scratch.path)

    # Table readers work on the temporary CSV
    enrollment = EnrollmentDataReader.read(scratch.path)
    ews = EwsDataReader.read(scratch.path)
    rte = RteDataReader.read(scratch.path)

    # Add the tables to the YAML data
    report['enrollment_data'] = EnrollmentYamlWriter.format_yaml(enrollment) if enrollment
    report['ews_data'] = EwsYamlWriter.format_yaml(ews)
    report['rte_data'] = RteYamlWriter.format_yaml(rte)

    {
      content: raw_text,
      compressed_content: squeezed_text,
      blocks: text_blocks,
      rectangles: rects,
      combined_data: merged,
      enrollment_data: enrollment,
      ews_data: ews,
      rte_data: rte,
      yaml_data: report
    }
  ensure
    scratch.close
    scratch.unlink
  end
end
|
70
|
+
|
71
|
+
# Writes every artefact of an extraction run to disk: raw and compressed text,
# the block/rectangle/combined CSVs, the three HTML tables and the YAML file.
# The destinations are computed by OutputPaths.
#
# @param pdf_path [String] path of the source PDF (determines the file names)
# @param data [Hash] result of extract_data
# @param output_dir [String, nil] target directory; nil selects the default
#   locations chosen by OutputPaths
def self.write_output_files(pdf_path, data, output_dir)
  # OutputPaths creates its fallback tmp directory itself; do the same for an
  # explicitly requested directory so the writes below do not fail with ENOENT.
  FileUtils.mkdir_p(output_dir) if output_dir

  paths = OutputPaths.new(pdf_path, output_dir)

  # Write text files
  File.write(paths.txt, data[:content])
  File.write(paths.compressed_txt, data[:compressed_content])

  # Write CSV files
  CSVWriter.write_blocks(data[:blocks], paths.blocks_csv)
  CSVWriter.write_rectangles(data[:rectangles], paths.rects_csv)
  CSVWriter.write_combined(data[:combined_data], paths.combined_csv)

  # Write HTML files
  RteHtmlWriter.generate_html(data[:rte_data], paths.rte_html)
  EnrollmentHtmlWriter.generate_html(data[:enrollment_data], paths.enrollment_html)
  EwsHtmlWriter.generate_html(data[:ews_data], paths.ews_html)

  # Write YAML file
  File.write(paths.yaml, data[:yaml_data].to_yaml)
end
|
91
|
+
|
92
|
+
# Computes the output file paths for one source PDF.
#
# One reader method is defined per EXTENSIONS entry (txt, compressed_txt,
# blocks_csv, rects_csv, combined_csv, rte_html, enrollment_html, ews_html,
# yaml). Each returns "<base name><suffix>" inside
# * the given output directory, if there is one;
# * otherwise, for yaml, the directory of the PDF;
# * otherwise a "tmp" directory below the current working directory, which
#   is created on demand.
class OutputPaths
  EXTENSIONS = {
    txt: '.txt',
    compressed_txt: '_compressed.txt',
    blocks_csv: '_blocks.csv',
    rects_csv: '_rects.csv',
    combined_csv: '_combined.csv',
    rte_html: '_rte.html',
    enrollment_html: '_enrollment.html',
    ews_html: '_ews.html',
    yaml: '.yml'
  }.freeze

  # @param pdf_path [String] path of the source PDF
  # @param output_dir [String, nil] target directory, or nil for the defaults
  def initialize(pdf_path, output_dir)
    @pdf_path = pdf_path
    @output_dir = output_dir

    # Strip the extension whatever its case ("report.PDF" => "report");
    # a fixed '.pdf' suffix left upper-case extensions in the base name.
    extension = File.extname(pdf_path)
    suffix = extension.casecmp?('.pdf') ? extension : '.pdf'
    @base_name = File.basename(pdf_path, suffix)
  end

  EXTENSIONS.each do |name, ext|
    define_method(name) do
      if @output_dir
        File.join(@output_dir, "#{@base_name}#{ext}")
      elsif name == :yaml
        File.join(File.dirname(@pdf_path), "#{@base_name}#{ext}")
      else
        tmp_dir = File.join(File.expand_path('.'), 'tmp')
        FileUtils.mkdir_p(tmp_dir)
        File.join(tmp_dir, "#{@base_name}#{ext}")
      end
    end
  end
end
|
125
|
+
|
126
|
+
# Builds the report hash from the compressed text of a report.
#
# Steps:
# 1. split the text into lines (stripped, backslashes removed);
# 2. load the template hash and merge the section readers' results into it
#    (top-level keys of a reader result replace the template's);
# 3. walk the lines and, for each known label, store the value found on the
#    following line(s) at a fixed place of the hash;
# 4. drop nil values and empty hashes on the first two levels.
#
# @param compressed_content [String] output of PDFContentCompressor.compress
# @return [Hash] the populated report data
def self.extract_data_points(compressed_content)
  lines = compressed_content.split("\n").map { |line| line.strip.gsub(/\\/, '') } # Remove escape characters

  # Load template as base structure
  # NOTE(review): the relative path is resolved against the current working
  # directory, so this only works when the process is started from a directory
  # that contains template.yml -- confirm that this is intended.
  data = YAML.load_file('template.yml')

  # Extract data using readers (read order kept as before)
  sections = {
    basic_info: BasicInfoDataReader.read(lines),
    location: LocationDataReader.read(lines),
    official: OfficialDataReader.read(lines),
    characteristics: CharacteristicsReader.read(lines),
    digital_facilities: DigitalFacilitiesDataReader.read(lines),
    anganwadi: AnganwadiDataReader.read(lines),
    building: BuildingDataReader.read(lines),
    rooms: RoomsDataReader.read(lines),
    teacher: TeacherDataReader.read(lines),
    sanitation: SanitationDataReader.read(lines),
    activities: ActivitiesDataReader.read(lines)
  }

  # Merge data from readers (merge order kept as before: activities precede
  # sanitation, so sanitation wins on a shared top-level key)
  %i[
    basic_info location official characteristics digital_facilities anganwadi
    building rooms teacher activities sanitation
  ].each do |name|
    section = sections[name]
    data.merge!(section) if section
  end

  lines.each_with_index do |line, i|
    next_line = lines[i + 1]&.strip

    case line
    # Basic Facilities
    when "Drinking Water Available"
      store_value(data, %w[facilities basic water available], next_line, /Drinking Water Fun/)
    when "Drinking Water Functional"
      store_value(data, %w[facilities basic water functional], next_line, /Rain/)
    when "Rain Water Harvesting"
      store_value(data, %w[facilities basic water rain_water_harvesting], next_line, /Playground/)
    when "Playground Available"
      store_value(data, %w[facilities basic safety playground], next_line, /Furniture/)
    when "Electricity Availability"
      store_value(data, %w[facilities basic electricity available], next_line, /Solar/)
    when "Solar Panel"
      store_value(data, %w[facilities basic electricity solar_panel], next_line, /Medical/)
    when "Furniture Availability"
      store_value(data, %w[infrastructure furniture count], next_line.to_i) if next_line&.match?(/^\d+$/)

    # Academic
    when /^Medium (\d)$/
      medium_num = Regexp.last_match(1)
      if (medium = next_line&.match(/^(\d+)-(.+)$/))
        data['academic']['medium_of_instruction']["medium_#{medium_num}"] = {
          'code' => medium[1],
          'name' => medium[2].strip
        }
      end

    when "CCE"
      if next_line
        implemented = data['academic']['assessments']['cce']['implemented']
        implemented['primary'] = next_line
        implemented['upper_primary'] = lines[i + 2] if lines[i + 2]
        implemented['secondary'] = lines[i + 3] if lines[i + 3]
        implemented['higher_secondary'] = lines[i + 4] if lines[i + 4]
      end

    # Residential Info
    when "Residential School"
      if (residential = next_line&.match(/^(\d+)\s*-\s*(.+)$/))
        data['facilities']['residential']['details']['type'] = "#{residential[1]} - #{residential[2].strip}"
      end
    when "Residential Type"
      store_value(data, %w[facilities residential details category], next_line, /Minority/)
    when "Minority School"
      store_value(data, %w[facilities residential details minority_school], next_line, /Approachable/)
    when "Approachable By All Weather Road"
      store_value(data, %w[facilities basic safety all_weather_road], next_line, /Toilets/)

    # Student Facilities
    when /No\.of Students Received/
      # Table header: nothing to store. The branch is kept so that this line
      # is never handled by one of the patterns below. (The former `i += 2`
      # only changed the block-local index and never skipped a line.)
    when /Free text books/
      store_primary_counts(data, %w[students facilities incentives free_textbooks], lines, i)
    when /Transport/
      store_primary_counts(data, %w[students facilities general transport], lines, i)
    when /Free uniform/
      store_primary_counts(data, %w[students facilities incentives free_uniform], lines, i)

    # Committees
    when "SMC Exists"
      store_value(data, %w[committees smc details exists], next_line, /SMC & SMDC/)
    when "SMC & SMDC Same"
      store_value(data, %w[committees smc details same_as_smdc], next_line, /SMDC Con/)
    when "SMDC Constituted"
      store_value(data, %w[committees smdc details constituted], next_line, /Text Books/)

    # Grants
    when "Grants Receipt"
      store_value(data, %w[grants received amount], next_line.to_f) if next_line&.match?(/^\d+\.?\d*$/)
    when "Grants Expenditure"
      store_value(data, %w[grants expenditure amount], next_line.to_f) if next_line&.match?(/^\d+\.?\d*$/)

    # Medical facilities
    when "Medical checkups"
      store_value(data, %w[facilities medical checkups available], next_line)

    # Sports facilities
    when "Sports Equipment"
      store_value(data, %w[academic sports equipment available], next_line)
    when "Physical Education Teacher"
      store_value(data, %w[academic sports instructors available], next_line)

    # Safety measures
    when "Fire Extinguisher"
      store_value(data, %w[facilities safety fire equipment extinguisher], next_line)
    when "Emergency Exit"
      store_value(data, %w[facilities safety emergency exits available], next_line)
    when "Security Guard"
      store_value(data, %w[facilities safety security personnel guard], next_line)

    # Committee meetings
    when "SMC Meetings Conducted"
      store_value(data, %w[committees smc details meetings count], next_line.to_i) if next_line&.match?(/^\d+$/)
    when "SMDC Meetings Conducted"
      store_value(data, %w[committees smdc details meetings count], next_line.to_i) if next_line&.match?(/^\d+$/)

    # Vocational courses
    when "Vocational Courses"
      store_value(data, %w[academic vocational courses available], next_line)
    when "Vocational Trainer"
      store_value(data, %w[academic vocational trainers available], next_line)
    end
  end

  # Clean up empty sections
  data.each_value do |section|
    section.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) } if section.is_a?(Hash)
  end
  data.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }

  data
end

# Stores +value+ under the nested string keys +path+ of +data+.
# Nothing is stored (and +data+ is not touched) when +value+ is nil or when it
# matches +stop_pattern+, i.e. when the following line is already the next
# label rather than a value. The intermediate hashes must already exist.
def self.store_value(data, path, value, stop_pattern = nil)
  return unless value
  return if stop_pattern && value.match?(stop_pattern)

  *parents, leaf = path
  parents.reduce(data) { |node, key| node[key] }[leaf] = value
end
private_class_method :store_value

# Stores the two lines following +index+ as the 'primary' and 'upper_primary'
# integer counts of the hash found at +path+, provided both lines consist of
# digits only.
def self.store_primary_counts(data, path, lines, index)
  primary = lines[index + 1]
  upper_primary = lines[index + 2]
  return unless primary&.match?(/^\d+$/) && upper_primary&.match?(/^\d+$/)

  target = path.reduce(data) { |node, key| node[key] }
  target['primary'] = primary.to_i
  target['upper_primary'] = upper_primary.to_i
end
private_class_method :store_primary_counts
|
294
|
+
end
|
295
|
+
end
|