udise_school_report_reader 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +20 -0
- data/LICENSE.txt +21 -0
- data/README.md +45 -0
- data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
- data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
- data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
- data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
- data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
- data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
- data/lib/udise_school_report_reader/csv_writer.rb +75 -0
- data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
- data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
- data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
- data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
- data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
- data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
- data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
- data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
- data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
- data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
- data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
- data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
- data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
- data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
- data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
- data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
- data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
- data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
- data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
- data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
- data/lib/udise_school_report_reader/version.rb +3 -0
- data/lib/udise_school_report_reader.rb +41 -0
- data/test/school_report_parser_test.rb +62 -0
- metadata +165 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
# Scans the raw content stream of every page of a PDF for rectangle (`re`)
# operators and records each rectangle together with the stroke colour, fill
# colour and line width most recently seen on the same page.
class PDFRectangleExtractor
  # State assumed on a page until the page's own content sets something else.
  DEFAULT_STROKE_COLOR = '0 G'
  DEFAULT_FILL_COLOR = '1 1 1 rg'
  DEFAULT_LINE_WIDTH = 1.0

  # The trailing \b keeps ordinary text such as "(Class 1 girls) Tj" from being
  # mistaken for a `g` / `G` / `w` / `re` operator.
  STROKE_COLOR_PATTERN = /[\d.]+ [\d.]+ [\d.]+ RG\b|[\d.]+ G\b/
  FILL_COLOR_PATTERN = /[\d.]+ [\d.]+ [\d.]+ rg\b|[\d.]+ g\b/
  LINE_WIDTH_PATTERN = /(\d+\.?\d*)\s+w\b/
  # NOTE(review): only unsigned operands are matched, so a rectangle written
  # with a negative coordinate or size is not picked up — confirm that is intended.
  RECTANGLE_PATTERN = /(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+re\b/

  # Collects the rectangles drawn on every page.
  #
  # @param reader [#pages] object whose pages each respond to `raw_content`
  # @return [Array<Hash>] one hash per rectangle with the keys :page (1-based),
  #   :x, :y, :width, :height (Floats), :stroke_color, :fill_color (the stripped
  #   content line that last set the colour) and :line_width (Float)
  # @raise [ArgumentError] if reader is nil
  def self.extract_rectangles(reader)
    raise ArgumentError, "PDF reader cannot be nil" if reader.nil?

    reader.pages.each_with_index.flat_map do |page, index|
      rectangles_on_page(page, index + 1)
    end
  end

  # Walks one page's content line by line. Colour and line-width tracking
  # starts from the defaults on each page so that settings made on an earlier
  # page do not leak into this one.
  def self.rectangles_on_page(page, page_number)
    stroke_color = DEFAULT_STROKE_COLOR
    fill_color = DEFAULT_FILL_COLOR
    line_width = DEFAULT_LINE_WIDTH
    rectangles = []

    page.raw_content.each_line do |line|
      stroke_color = line.strip if line.match?(STROKE_COLOR_PATTERN)
      fill_color = line.strip if line.match?(FILL_COLOR_PATTERN)

      width_match = line.match(LINE_WIDTH_PATTERN)
      line_width = width_match[1].to_f if width_match

      rect_match = line.match(RECTANGLE_PATTERN)
      next unless rect_match

      x, y, width, height = rect_match.captures.map(&:to_f)
      rectangles << {
        page: page_number,
        x: x,
        y: y,
        width: width,
        height: height,
        stroke_color: stroke_color,
        fill_color: fill_color,
        line_width: line_width
      }
    end

    rectangles
  end
  private_class_method :rectangles_on_page
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require_relative 'data_reader_base'
|
2
|
+
|
3
|
+
# Field table for the rooms part of the report.
#
# Each entry is keyed by a label from the report and carries:
#   key_path    - nested keys under which the value is stored
#   value_type  - :integer for the numeric fields (absent for the others)
#   end_pattern - pattern associated with the label that follows this field
#
# The table is consumed by DataReaderBase, which is defined in another file;
# presumably end_pattern marks where a field's value stops — confirm there.
class RoomsDataReader
  include DataReaderBase

  FIELD_MAPPINGS = {
    'In Good Condition' => {
      key_path: %w[rooms classrooms good_condition], value_type: :integer, end_pattern: /Needs Minor/
    },
    'Needs Minor Repair' => {
      key_path: %w[rooms classrooms needs_minor_repair], value_type: :integer, end_pattern: /Needs Major/
    },
    'Needs Major Repair' => {
      key_path: %w[rooms classrooms needs_major_repair], value_type: :integer, end_pattern: /Other Rooms/
    },
    'Other Rooms' => { key_path: %w[rooms other], value_type: :integer, end_pattern: /Library/ },
    'Library Availability' => { key_path: %w[rooms library], end_pattern: /Solar/ },
    'Separate Room for HM' => { key_path: %w[rooms hm], end_pattern: /Drinking/ }
  }
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'csv'

# Reads the "students enrolled under Section 12 of the RTE Act" table out of
# the combined cells CSV. Only page-1 cells are considered. The table is
# located by its title cell; the three rows directly below the title (by
# rect_y) are taken as the grade header row, the B/G header row and the
# values row.
class RteDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII'
  ]

  # Text that identifies the title cell of the table.
  TITLE_TEXT = 'Total no. of Students Enrolled Under Section 12 of the RTE Act In Private Unaided and Specified Category Schools'

  # Maximum horizontal distance (in text_x units) between a header cell and a
  # value cell for the two to count as the same column.
  X_TOLERANCE = 10.0

  # Convenience wrapper: build a reader for csv_path and return its #read result.
  def self.read(csv_path) = new(csv_path).read

  # Loads the CSV and locates the table rows. When the title or the three
  # rows below it cannot be found, the row instance variables stay nil and
  # #read returns nil.
  #
  # @param csv_path [String] CSV with at least the columns page, text,
  #   text_x and rect_y
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    # Group page-1 cells into rows keyed by their rect_y
    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end

    @title_row = @rows.find { |_, cells| cells.any? { |cell| cell&.dig('text')&.include?(TITLE_TEXT) } }
    title_y = @title_row&.first
    return unless title_y

    # Rows with a smaller rect_y than the title, nearest to the title first
    rows_after_title = @rows.select { |y, _| y < title_y.to_f }.sort_by(&:first).reverse
    return unless rows_after_title.size >= 3

    @grades_row, @bg_row, @values_row = rows_after_title.first(3).map(&:last)

    # Header rows are ordered left to right
    [@grades_row, @bg_row].each { |row| row.sort_by! { |cell| cell['text_x'].to_f } }

    @values_row = align_values_to_bg_cells(@values_row, @bg_row)

    # Blank values (including the nil CSV yields for an empty field) become "-"
    @values_row.each { |cell| cell['text'] = '-' if cell['text'].to_s.strip.empty? }
  end

  # @return [Hash, nil] nil when the table was not found, otherwise
  #   :grade_rows  - grade header cells, left to right
  #   :bg_pairs    - { x_mid => [b_cell, g_cell] }, x_mid being the mean text_x
  #   :rte_numbers - { x_mid => [b_value_cell, g_value_cell] }
  def read
    return nil unless @grades_row && @bg_row && @values_row

    bg_pairs = {}
    @bg_row.each_slice(2) do |pair|
      next unless pair.size == 2 && pair[0] && pair[1] # Skip incomplete pairs

      b, g = pair
      x_mid = (b['text_x'].to_f + g['text_x'].to_f) / 2
      bg_pairs[x_mid] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      rte_numbers: match_numbers_to_pairs(@values_row, bg_pairs),
    }
  end

  private

  # Produces one value cell per B and per G header cell, in header order.
  # A trailing header cell without a partner is skipped, matching what #read
  # does with incomplete pairs.
  def align_values_to_bg_cells(values, bg_cells)
    bg_cells.each_slice(2).flat_map do |pair|
      next [] unless pair.size == 2

      pair.map { |header| value_near(values, header['text_x'].to_f) }
    end
  end

  # First value cell within X_TOLERANCE of x, or a "-" placeholder at x.
  def value_near(values, x)
    values.find { |cell| (cell['text_x'].to_f - x).abs < X_TOLERANCE } ||
      { 'text' => '-', 'text_x' => x }
  end

  # For every B/G pair, picks the first remaining value cell whose text_x is
  # within threshold of the B cell, then of the G cell. A picked cell is
  # removed from the pool so it is not handed to a later column.
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      next unless bg_pair && bg_pair.size == 2 # Skip invalid pairs

      b_x = bg_pair[0]['text_x'].to_f
      g_x = bg_pair[1]['text_x'].to_f

      b_num = remaining.find { |cell| (cell['text_x'].to_f - b_x).abs < threshold }
      remaining.delete(b_num) if b_num

      g_num = remaining.find { |cell| (cell['text_x'].to_f - g_x).abs < threshold }
      remaining.delete(g_num) if g_num

      numbers[x_mid] = [b_num, g_num]
    end

    numbers
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Renders the RTE table produced by RteDataReader#read as a standalone HTML page.
class RteHtmlWriter
  # Writes the HTML page to html_path.
  #
  # @param data [Hash, nil] expects :grade_rows (cells with a 'text' entry),
  #   :bg_pairs ({ x_mid => [b_cell, g_cell] }) and :rte_numbers
  #   ({ x_mid => [b_value_cell, g_value_cell] }); the numbers hash may be
  #   keyed by x_mid itself or by its string form
  # @param html_path [String] file to write
  # @return [Integer, nil] result of File.write, or nil when data is nil
  #   (nothing is written in that case)
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    # RteDataReader#read publishes the values under :rte_numbers. The
    # :ews_numbers key this writer used to read is still honoured as a
    # fallback for callers that relied on it.
    categories = [
      ['RTE', data[:rte_numbers] || data[:ews_numbers] || {}],
    ]

    # One table row per category, one B cell and one G cell per B/G pair
    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        b_num, g_num = numbers[x_mid.to_s] || numbers[x_mid] || []
        "<td>#{b_num ? b_num['text'] : ''}</td><td>#{g_num ? g_num['text'] : ''}</td>"
      end.join

      "      <tr>\n" \
      "        <td class=\"category\">#{category}</td>\n" \
      "        #{cells}\n" \
      "      </tr>"
    end.join("\n")

    # Each grade header spans its B and G columns
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{row['text']}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
        <title>RTE Enrollment Table</title>
        <style>
          table { border-collapse: collapse; margin-top: 20px; width: 100%; }
          th, td { border: 1px solid black; padding: 8px; text-align: center; }
          .header { font-weight: bold; background-color: #f0f0f0; }
          .grade { font-weight: bold; background-color: #e0e0e0; }
          .bg-pair { background-color: #f8f8f8; }
          .category { font-weight: bold; text-align: left; }
        </style>
      </head>
      <body>
        <h2>Enrolment Under Section 12 of the RTE Act</h2>
        <table>
          <tr class="grade">
            <th rowspan="2">Category</th>
            #{grade_headers}
          </tr>
          <tr class="bg-pair">
            #{bg_headers}
          </tr>
      #{table_rows}
        </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Converts the table returned by RteDataReader#read into a nested hash of
# boys/girls counts per grade, ready to be embedded in the YAML output.
class RteYamlWriter
  # Grade header text => key used in the output hash.
  GRADE_MAPPING = {
    'Pre-Pri.' => 'pre-pri.',
    'Class I' => 'class_i',
    'Class II' => 'class_ii',
    'Class III' => 'class_iii',
    'Class IV' => 'class_iv',
    'Class V' => 'class_v',
    'Class VI' => 'class_vi',
    'Class VII' => 'class_vii',
    'Class VIII' => 'class_viii',
    'Class IX' => 'class_ix',
    'Class X' => 'class_x',
    'Class XI' => 'class_xi',
    'Class XII' => 'class_xii'
  }.freeze

  # Builds { 'rte' => { grade_key => { 'boys' => n, 'girls' => n } } }.
  #
  # Every grade column with a non-blank label starts at 0/0. A value pair is
  # assigned to the first grade column whose [rect_x, rect_x + rect_width]
  # span contains the pair's x_mid; pairs that fall in no column are ignored.
  # A value of "-" (or a missing cell) counts as 0.
  #
  # @param data [Hash, nil] expects :grade_rows (cells with 'text', 'rect_x',
  #   'rect_width') and :rte_numbers ({ x_mid => [boys_cell, girls_cell] });
  #   x_mid may be numeric or a numeric string
  # @return [Hash, nil] nil when data is nil
  def self.format_yaml(data)
    return unless data

    columns = grade_columns(data[:grade_rows] || [])
    rte = columns.each_with_object({}) do |column, acc|
      acc[column[:key]] = { 'boys' => 0, 'girls' => 0 }
    end

    (data[:rte_numbers] || {}).each do |x_mid, pair|
      next unless pair && pair.size == 2

      x = x_mid.to_f
      column = columns.find { |col| x >= col[:x_start] && x <= col[:x_end] }
      next unless column

      rte[column[:key]]['boys'] = count_from(pair[0])
      rte[column[:key]]['girls'] = count_from(pair[1])
    end

    { 'rte' => rte }
  end

  # Describes each labelled grade cell by its output key and horizontal span.
  # Cells whose label is nil or blank are left out, so no grade is created
  # for them and values positioned under them are dropped.
  def self.grade_columns(grade_rows)
    grade_rows.filter_map do |row|
      label = row['text']
      next if label.to_s.strip.empty?

      x_start = row['rect_x'].to_f
      {
        key: GRADE_MAPPING[label] || label.downcase.gsub(/\s+/, '_'),
        x_start: x_start,
        x_end: x_start + row['rect_width'].to_f
      }
    end
  end
  private_class_method :grade_columns

  # Integer value of a value cell; "-" and a missing cell both give 0.
  def self.count_from(cell)
    text = cell&.dig('text')
    text == '-' ? 0 : text.to_i
  end
  private_class_method :count_from
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require_relative 'data_reader_base'
|
2
|
+
|
3
|
+
# Field table for the sanitation part of the report.
#
# The two handwash entries are plain fields (key_path plus end_pattern).
# 'Toilets' is flagged as a table: it lists four sections, each with a
# trigger (regexp or string), an offset of 1 and a boys/girls pair of integer
# fields.
#
# The table is consumed by DataReaderBase, which is defined in another file;
# presumably trigger locates a table row and offset says how far after it the
# values start — confirm there.
class SanitationDataReader
  include DataReaderBase

  FIELD_MAPPINGS = {
    'Handwash Near Toilet' => { key_path: %w[sanitation handwash near_toilet], end_pattern: /Handwash Facility/ },
    'Handwash Facility for Meal' => { key_path: %w[sanitation handwash for_meal], end_pattern: /Total Class/ },
    'Toilets' => {
      key_path: %w[sanitation toilets],
      is_table: true,
      table_config: {
        sections: [
          {
            trigger: /Total.*CWSN/,
            offset: 1,
            fields: [
              { key: %w[boys total], value_type: :integer },
              { key: %w[girls total], value_type: :integer }
            ]
          },
          {
            trigger: "Functional",
            offset: 1,
            fields: [
              { key: %w[boys functional], value_type: :integer },
              { key: %w[girls functional], value_type: :integer }
            ]
          },
          {
            trigger: /CWSN Friendly/,
            offset: 1,
            fields: [
              { key: %w[boys cwsn], value_type: :integer },
              { key: %w[girls cwsn], value_type: :integer }
            ]
          },
          {
            trigger: "Urinal",
            offset: 1,
            fields: [
              { key: %w[boys urinals], value_type: :integer },
              { key: %w[girls urinals], value_type: :integer }
            ]
          }
        ]
      }
    }
  }
end
|
@@ -0,0 +1,295 @@
|
|
1
|
+
require 'pdf-reader'
|
2
|
+
require 'yaml'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'tempfile'
|
6
|
+
|
7
|
+
module UdiseSchoolReportReader
  # Entry point that turns a school report PDF into a nested data hash and,
  # on request, writes the intermediate text/CSV/HTML files and the final YAML.
  class SchoolReportParser
    # Parses the PDF and returns the extracted data.
    #
    # @param pdf_path [String] path to the report PDF
    # @param output_dir [String, nil] directory for the generated files; see
    #   OutputPaths for where files go when this is nil
    # @param write_files [Boolean] when true, the output files are written
    # @return [Hash] the data that is also serialised to the YAML file
    # @raise [ArgumentError] if pdf_path does not exist
    def self.extract_to_text(pdf_path, output_dir = nil, write_files = false)
      raise ArgumentError, "PDF file not found" unless File.exist?(pdf_path)

      extracted_data = extract_data(pdf_path)
      write_output_files(pdf_path, extracted_data, output_dir) if write_files

      extracted_data[:yaml_data]
    end

    # NOTE: a bare `private` used to precede the methods below. It has no
    # effect on methods defined with `def self.`, so they have always been
    # publicly callable; the misleading keyword was dropped and their
    # visibility is unchanged.

    # Runs every extraction step for one PDF.
    #
    # The combined block/rectangle data is written to a temporary CSV because
    # the table readers take a CSV path; the file is removed before returning.
    #
    # @return [Hash] :content, :compressed_content, :blocks, :rectangles,
    #   :combined_data, :enrollment_data, :ews_data, :rte_data, :yaml_data
    def self.extract_data(pdf_path)
      reader = PDF::Reader.new(pdf_path)

      # Extract raw content
      content = reader.pages.map(&:raw_content).join("\n")
      compressed_content = PDFContentCompressor.compress(content)

      # Extract blocks and rectangles
      blocks = PDFBlockExtractor.extract_blocks(reader)
      rectangles = PDFRectangleExtractor.extract_rectangles(reader)
      combined_data = BlockRectangleCombiner.combine(blocks, rectangles)

      # Extract YAML data
      yaml_data = extract_data_points(compressed_content)

      # Create temporary file for combined data
      temp_file = Tempfile.new(['combined', '.csv'])
      begin
        CSVWriter.write_combined(combined_data, temp_file.path)

        # Extract table data using the temp file
        enrollment_data = EnrollmentDataReader.read(temp_file.path)
        ews_data = EwsDataReader.read(temp_file.path)
        rte_data = RteDataReader.read(temp_file.path)

        # Format table data for YAML
        yaml_data['enrollment_data'] = EnrollmentYamlWriter.format_yaml(enrollment_data) if enrollment_data
        yaml_data['ews_data'] = EwsYamlWriter.format_yaml(ews_data)
        yaml_data['rte_data'] = RteYamlWriter.format_yaml(rte_data)

        {
          content: content,
          compressed_content: compressed_content,
          blocks: blocks,
          rectangles: rectangles,
          combined_data: combined_data,
          enrollment_data: enrollment_data,
          ews_data: ews_data,
          rte_data: rte_data,
          yaml_data: yaml_data
        }
      ensure
        temp_file.close
        temp_file.unlink
      end
    end

    # Writes the text, CSV, HTML and YAML files for one parsed PDF to the
    # locations given by OutputPaths.
    def self.write_output_files(pdf_path, data, output_dir)
      paths = OutputPaths.new(pdf_path, output_dir)

      # Write text files
      File.write(paths.txt, data[:content])
      File.write(paths.compressed_txt, data[:compressed_content])

      # Write CSV files
      CSVWriter.write_blocks(data[:blocks], paths.blocks_csv)
      CSVWriter.write_rectangles(data[:rectangles], paths.rects_csv)
      CSVWriter.write_combined(data[:combined_data], paths.combined_csv)

      # Write HTML files
      RteHtmlWriter.generate_html(data[:rte_data], paths.rte_html)
      EnrollmentHtmlWriter.generate_html(data[:enrollment_data], paths.enrollment_html)
      EwsHtmlWriter.generate_html(data[:ews_data], paths.ews_html)

      # Write YAML file
      File.write(paths.yaml, data[:yaml_data].to_yaml)
    end

    # Computes the output file path for each artefact of one PDF. There is one
    # reader method per EXTENSIONS key (txt, compressed_txt, blocks_csv, ...).
    #
    # Location rules:
    # * output_dir given -> every file goes there
    # * otherwise the YAML file goes next to the PDF and every other file
    #   goes to ./tmp under the current working directory
    #
    # The output_dir and ./tmp directories are created when a path inside
    # them is requested, so the path can be written to straight away.
    class OutputPaths
      # Artefact name => suffix appended to the PDF's base name.
      EXTENSIONS = {
        txt: '.txt',
        compressed_txt: '_compressed.txt',
        blocks_csv: '_blocks.csv',
        rects_csv: '_rects.csv',
        combined_csv: '_combined.csv',
        rte_html: '_rte.html',
        enrollment_html: '_enrollment.html',
        ews_html: '_ews.html',
        yaml: '.yml'
      }

      def initialize(pdf_path, output_dir)
        @pdf_path = pdf_path
        @output_dir = output_dir
        @base_name = File.basename(pdf_path, '.pdf')
      end

      EXTENSIONS.each do |name, ext|
        define_method(name) do
          File.join(directory_for(name), "#{@base_name}#{ext}")
        end
      end

      private

      # Directory the named artefact belongs in, created if it is one this
      # class is responsible for (output_dir or ./tmp).
      def directory_for(name)
        return File.dirname(@pdf_path) if !@output_dir && name == :yaml

        dir = @output_dir || File.join(File.expand_path('.'), 'tmp')
        FileUtils.mkdir_p(dir)
        dir
      end
    end

    # Builds the data hash from the compressed text content.
    #
    # Starts from the structure in template.yml, merges in what the section
    # readers return, then fills the remaining fields by scanning for known
    # labels and reading the line(s) that follow them. Sections that end up
    # nil or empty are dropped.
    #
    # @param compressed_content [String] newline-separated text
    # @return [Hash]
    def self.extract_data_points(compressed_content)
      lines = compressed_content.split("\n").map { |line| line.strip.gsub(/\\/, '') } # Remove escape characters

      # Load template as base structure
      # NOTE(review): relative path, so it is resolved against the current
      # working directory — confirm callers always run from where template.yml lives.
      data = YAML.load_file('template.yml')

      # Extract data using readers
      basic_info_data = BasicInfoDataReader.read(lines)
      location_data = LocationDataReader.read(lines)
      official_data = OfficialDataReader.read(lines)
      characteristics_data = CharacteristicsReader.read(lines)
      digital_facilities_data = DigitalFacilitiesDataReader.read(lines)
      anganwadi_data = AnganwadiDataReader.read(lines)
      building_data = BuildingDataReader.read(lines)
      rooms_data = RoomsDataReader.read(lines)
      teacher_data = TeacherDataReader.read(lines)
      sanitation_data = SanitationDataReader.read(lines)
      activities_data = ActivitiesDataReader.read(lines)

      # Merge data from readers (later merges win on top-level key clashes)
      data.merge!(basic_info_data) if basic_info_data
      data.merge!(location_data) if location_data
      data.merge!(official_data) if official_data
      data.merge!(characteristics_data) if characteristics_data
      data.merge!(digital_facilities_data) if digital_facilities_data
      data.merge!(anganwadi_data) if anganwadi_data
      data.merge!(building_data) if building_data
      data.merge!(rooms_data) if rooms_data
      data.merge!(teacher_data) if teacher_data
      data.merge!(activities_data) if activities_data
      data.merge!(sanitation_data) if sanitation_data

      lines.each_with_index do |line, i|
        next_line = lines[i + 1]&.strip

        # Most branches take the following line as the value unless that line
        # is itself the next label (i.e. the value was missing).
        case line
        # Basic Facilities
        when "Drinking Water Available"
          data['facilities']['basic']['water']['available'] = next_line if next_line && !next_line.match?(/Drinking Water Fun/)
        when "Drinking Water Functional"
          data['facilities']['basic']['water']['functional'] = next_line if next_line && !next_line.match?(/Rain/)
        when "Rain Water Harvesting"
          data['facilities']['basic']['water']['rain_water_harvesting'] = next_line if next_line && !next_line.match?(/Playground/)
        when "Playground Available"
          data['facilities']['basic']['safety']['playground'] = next_line if next_line && !next_line.match?(/Furniture/)
        when "Electricity Availability"
          data['facilities']['basic']['electricity']['available'] = next_line if next_line && !next_line.match?(/Solar/)
        when "Solar Panel"
          data['facilities']['basic']['electricity']['solar_panel'] = next_line if next_line && !next_line.match?(/Medical/)
        when "Furniture Availability"
          if next_line =~ /^\d+$/
            data['infrastructure']['furniture']['count'] = next_line.to_i
          end

        # Academic
        when /^Medium (\d)$/
          medium_num = $1
          if next_line && next_line =~ /^(\d+)-(.+)$/
            code = $1
            name = $2.strip
            data['academic']['medium_of_instruction']["medium_#{medium_num}"] = {
              'code' => code,
              'name' => name
            }
          end

        when "CCE"
          if next_line
            data['academic']['assessments']['cce']['implemented']['primary'] = next_line
            data['academic']['assessments']['cce']['implemented']['upper_primary'] = lines[i + 2] if lines[i + 2]
            data['academic']['assessments']['cce']['implemented']['secondary'] = lines[i + 3] if lines[i + 3]
            data['academic']['assessments']['cce']['implemented']['higher_secondary'] = lines[i + 4] if lines[i + 4]
          end

        # Residential Info
        when "Residential School"
          if next_line && next_line =~ /^(\d+)\s*-\s*(.+)$/
            code = $1
            type = $2.strip
            data['facilities']['residential']['details']['type'] = "#{code} - #{type}"
          end
        when "Residential Type"
          data['facilities']['residential']['details']['category'] = next_line if next_line && !next_line.match?(/Minority/)
        when "Minority School"
          data['facilities']['residential']['details']['minority_school'] = next_line if next_line && !next_line.match?(/Approachable/)
        when "Approachable By All Weather Road"
          data['facilities']['basic']['safety']['all_weather_road'] = next_line if next_line && !next_line.match?(/Toilets/)

        # Student Facilities
        when /No\.of Students Received/
          # Header line: nothing to read. The branch is kept so the header is
          # not tested against the patterns below. (An `i += 2` used to sit
          # here; reassigning the block parameter never skipped any lines.)
        when /Free text books/
          if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
            data['students']['facilities']['incentives']['free_textbooks']['primary'] = lines[i + 1].to_i
            data['students']['facilities']['incentives']['free_textbooks']['upper_primary'] = lines[i + 2].to_i
          end
        when /Transport/
          if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
            data['students']['facilities']['general']['transport']['primary'] = lines[i + 1].to_i
            data['students']['facilities']['general']['transport']['upper_primary'] = lines[i + 2].to_i
          end
        when /Free uniform/
          if lines[i + 1] =~ /^\d+$/ && lines[i + 2] =~ /^\d+$/
            data['students']['facilities']['incentives']['free_uniform']['primary'] = lines[i + 1].to_i
            data['students']['facilities']['incentives']['free_uniform']['upper_primary'] = lines[i + 2].to_i
          end

        # Committees
        when "SMC Exists"
          data['committees']['smc']['details']['exists'] = next_line if next_line && !next_line.match?(/SMC & SMDC/)
        when "SMC & SMDC Same"
          data['committees']['smc']['details']['same_as_smdc'] = next_line if next_line && !next_line.match?(/SMDC Con/)
        when "SMDC Constituted"
          data['committees']['smdc']['details']['constituted'] = next_line if next_line && !next_line.match?(/Text Books/)

        # Grants
        when "Grants Receipt"
          if next_line =~ /^\d+\.?\d*$/
            data['grants']['received']['amount'] = next_line.to_f
          end
        when "Grants Expenditure"
          if next_line =~ /^\d+\.?\d*$/
            data['grants']['expenditure']['amount'] = next_line.to_f
          end

        # Medical facilities
        when "Medical checkups"
          data['facilities']['medical']['checkups']['available'] = next_line if next_line

        # Sports facilities
        when "Sports Equipment"
          data['academic']['sports']['equipment']['available'] = next_line if next_line
        when "Physical Education Teacher"
          data['academic']['sports']['instructors']['available'] = next_line if next_line

        # Safety measures
        when "Fire Extinguisher"
          data['facilities']['safety']['fire']['equipment']['extinguisher'] = next_line if next_line
        when "Emergency Exit"
          data['facilities']['safety']['emergency']['exits']['available'] = next_line if next_line
        when "Security Guard"
          data['facilities']['safety']['security']['personnel']['guard'] = next_line if next_line

        # Committee meetings
        when "SMC Meetings Conducted"
          data['committees']['smc']['details']['meetings']['count'] = next_line.to_i if next_line =~ /^\d+$/
        when "SMDC Meetings Conducted"
          data['committees']['smdc']['details']['meetings']['count'] = next_line.to_i if next_line =~ /^\d+$/

        # Vocational courses
        when "Vocational Courses"
          data['academic']['vocational']['courses']['available'] = next_line if next_line
        when "Vocational Trainer"
          data['academic']['vocational']['trainers']['available'] = next_line if next_line
        end
      end

      # Clean up empty sections: first one level down, then at the top level
      data.each do |_, section|
        if section.is_a?(Hash)
          section.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }
        end
      end
      data.reject! { |_, v| v.nil? || (v.is_a?(Hash) && v.empty?) }

      data
    end
  end
end
|